mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
scale by 1 for neon implemented
// Converts a row of unsigned 16-bit integers to half floats without scaling.
// The unnamed float parameter is ignored; it is present so the signature
// matches HalfFloatRow_NEON.
//   src:   input row of uint16 values.
//   dst:   output row of half floats, stored as uint16 bit patterns.
//   width: number of values to convert.
// Each loop iteration handles 8 values (16 bytes in, 16 bytes out). The loop
// body runs before the remaining count is tested, so at least 8 values are
// always read and written.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts, advance src by 16 bytes
"subs %w2, %w2, #8 \n" // 8 pixels per loop; sets flags for b.gt below
"uxtl v2.4s, v1.4h \n" // zero-extend low 4 shorts to 4 ints
"uxtl2 v1.4s, v1.8h \n" // zero-extend high 4 shorts to 4 ints
"scvtf v2.4s, v2.4s \n" // 8 ints -> 8 floats
"scvtf v1.4s, v1.4s \n"
"fcvtn v4.4h, v2.4s \n" // narrow 8 floats -> 8 half floats
"fcvtn2 v4.8h, v1.4s \n"
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts, advance dst by 16 bytes
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v4"
);
}
// Converts a row of unsigned 16-bit integers to half floats, multiplying each
// value by scale.
//   src:   input row of uint16 values.
//   dst:   output row of half floats, stored as uint16 bit patterns.
//   scale: multiplier applied to every value before narrowing.
//   width: number of values to convert.
// No float-to-half convert instruction is used. Each value is multiplied by
// scale * 1.9259299444e-34f; the constant is approximately 2^-112, i.e. the
// difference between the float exponent bias (127) and the half float
// exponent bias (15). The 32-bit result is then shifted right by 13 bits
// (23 - 10 mantissa bits) and narrowed to 16 bits, so the low mantissa bits
// are dropped rather than rounded.
// Each loop iteration handles 8 values (16 bytes in, 16 bytes out). The loop
// body runs before the remaining count is tested, so at least 8 values are
// always read and written.
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
asm volatile (
"1: \n"
MEMACCESS(0)
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts, advance src by 16 bytes
"subs %w2, %w2, #8 \n" // 8 pixels per loop; sets flags for b.gt below
"uxtl v2.4s, v1.4h \n" // zero-extend low 4 shorts to 4 ints
"uxtl2 v1.4s, v1.8h \n" // zero-extend high 4 shorts to 4 ints
"scvtf v2.4s, v2.4s \n" // 8 ints -> 8 floats
"scvtf v1.4s, v1.4s \n"
"fmul v2.4s, v2.4s, %3.s[0] \n" // apply scale and adjust exponent bias
"fmul v1.4s, v1.4s, %3.s[0] \n"
"uqshrn v4.4h, v2.4s, #13 \n" // shift right 13, saturating narrow to 16 bits
"uqshrn2 v4.8h, v1.4s, #13 \n"
MEMACCESS(1)
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts, advance dst by 16 bytes
"b.gt 1b \n" // repeat while the remaining width is > 0
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3: scale pre-multiplied by ~2^-112
: "cc", "memory", "v1", "v2", "v4"
);
}
TEST=LibYUVPlanarTest.TestHalfFloatPlane_One
BUG=libyuv:560
R=hubbe@chromium.org
Review URL: https://codereview.chromium.org/2430313008 .
This commit is contained in:
parent
550cf829fb
commit
451af5e922
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1629
|
||||
Version: 1630
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -1959,9 +1959,15 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
|
||||
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloat1Row_Any_F16C(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloat1Row_Any_NEON(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
const uint8* luma, uint32 lumacoeff);
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1629
|
||||
#define LIBYUV_VERSION 1630
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -2579,17 +2579,19 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_F16C)
|
||||
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_F16C;
|
||||
HalfFloatRow = (scale == 1.0f) ?
|
||||
HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
HalfFloatRow = HalfFloatRow_F16C;
|
||||
HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_NEON;
|
||||
HalfFloatRow = (scale == 1.0f) ?
|
||||
HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
HalfFloatRow = HalfFloatRow_NEON;
|
||||
HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -577,16 +577,18 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
|
||||
}
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_SSE2
|
||||
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15)
|
||||
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_F16C
|
||||
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
|
||||
ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_NEON
|
||||
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7)
|
||||
ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7)
|
||||
#endif
|
||||
#undef ANY11P16
|
||||
|
||||
|
||||
@ -5410,6 +5410,36 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_F16C
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_F16C
|
||||
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
|
||||
asm volatile (
|
||||
// 16 pixel loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints
|
||||
"vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"vcvtdq2ps %%ymm2,%%ymm2 \n"
|
||||
"vcvtdq2ps %%ymm3,%%ymm3 \n"
|
||||
"vcvtps2ph $3, %%ymm2, %%xmm2 \n"
|
||||
"vcvtps2ph $3, %%ymm3, %%xmm3 \n"
|
||||
"vmovdqu %%xmm2," MEMACCESS(1) " \n"
|
||||
"vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "memory", "cc",
|
||||
"xmm2", "xmm3"
|
||||
);
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_F16C
|
||||
|
||||
#ifdef HAS_ARGBCOLORTABLEROW_X86
|
||||
// Transform ARGB pixels with color table.
|
||||
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
|
||||
|
||||
@ -2711,6 +2711,55 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||
"uxtl v2.4s, v1.4h \n" // 8 int's
|
||||
"uxtl2 v1.4s, v1.8h \n"
|
||||
"scvtf v2.4s, v2.4s \n" // 8 floats
|
||||
"scvtf v1.4s, v1.4s \n"
|
||||
"fcvtn v4.4h, v2.4s \n" // 8 floatsgit
|
||||
"fcvtn2 v4.8h, v1.4s \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "cc", "memory", "v1", "v2", "v4"
|
||||
);
|
||||
}
|
||||
|
||||
void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||
"uxtl v2.4s, v1.4h \n" // 8 int's
|
||||
"uxtl2 v1.4s, v1.8h \n"
|
||||
"scvtf v2.4s, v2.4s \n" // 8 floats
|
||||
"scvtf v1.4s, v1.4s \n"
|
||||
"fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
|
||||
"fmul v1.4s, v1.4s, %3.s[0] \n"
|
||||
"uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat
|
||||
"uqshrn2 v4.8h, v1.4s, #13 \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v4.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "w"(scale * 1.9259299444e-34f) // %3
|
||||
: "cc", "memory", "v1", "v2", "v4"
|
||||
);
|
||||
}
|
||||
|
||||
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
|
||||
@ -2084,17 +2084,22 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
|
||||
int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
|
||||
int benchmark_iterations,
|
||||
int disable_cpu_flags, int benchmark_cpu_info,
|
||||
float scale) {
|
||||
float scale, int mask) {
|
||||
int i, j;
|
||||
const int y_plane_size = benchmark_width * benchmark_height * 2;
|
||||
|
||||
align_buffer_page_end(orig_y, y_plane_size);
|
||||
align_buffer_page_end(dst_c, y_plane_size);
|
||||
align_buffer_page_end(dst_opt, y_plane_size);
|
||||
align_buffer_page_end(orig_y, y_plane_size * 3);
|
||||
uint8* dst_opt = orig_y + y_plane_size;
|
||||
uint8* dst_c = orig_y + y_plane_size * 2;
|
||||
|
||||
MemRandomize(orig_y, y_plane_size);
|
||||
memset(dst_c, 0, y_plane_size);
|
||||
memset(dst_opt, 1, y_plane_size);
|
||||
|
||||
for (i = 0; i < y_plane_size / 2; ++i) {
|
||||
reinterpret_cast<uint16*>(orig_y)[i] = static_cast<uint16>(i & mask);
|
||||
}
|
||||
|
||||
// Disable all optimizations.
|
||||
MaskCpuFlags(disable_cpu_flags);
|
||||
double c_time = get_time();
|
||||
@ -2122,38 +2127,62 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height,
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(orig_y);
|
||||
free_aligned_buffer_page_end(dst_c);
|
||||
free_aligned_buffer_page_end(dst_opt);
|
||||
return diff;
|
||||
}
|
||||
|
||||
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
|
||||
// exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally
|
||||
// happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_denormal) {
|
||||
#define MAXHALFDIFF 0
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_,
|
||||
1.0f / 65536.0f);
|
||||
EXPECT_EQ(diff, 0);
|
||||
1.0f / 65536.0f, 65535);
|
||||
EXPECT_LE(diff, MAXHALFDIFF);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_,
|
||||
1.0f / 1024.0f, 1023);
|
||||
EXPECT_LE(diff, MAXHALFDIFF);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_,
|
||||
1.0f / 512.0f, 511);
|
||||
EXPECT_LE(diff, MAXHALFDIFF);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_,
|
||||
1.0f / 4096.0f);
|
||||
EXPECT_EQ(diff, 0);
|
||||
1.0f / 4096.0f, 4095);
|
||||
EXPECT_LE(diff, MAXHALFDIFF);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_,
|
||||
1.0f, 4095);
|
||||
EXPECT_LE(diff, MAXHALFDIFF);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_,
|
||||
1.0f / 1023.0f);
|
||||
EXPECT_EQ(diff, 0);
|
||||
1.0f / 4095.0f, 4095);
|
||||
EXPECT_LE(diff, MAXHALFDIFF);
|
||||
}
|
||||
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
|
||||
SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
|
||||
SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user