From 451af5e922e026c266d25abc92e7519acfc9a4c5 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 21 Oct 2016 14:30:03 -0700 Subject: [PATCH] scale by 1 for neon implemented void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { asm volatile ( "1: \n" MEMACCESS(0) "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's "uxtl2 v1.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v1.4s, v1.4s \n" "fcvtn v4.4h, v2.4s \n" // 8 floatsgit "fcvtn2 v4.8h, v1.4s \n" MEMACCESS(1) "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v4" ); } void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile ( "1: \n" MEMACCESS(0) "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's "uxtl2 v1.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v1.4s, v1.4s \n" "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent "fmul v1.4s, v1.4s, %3.s[0] \n" "uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat "uqshrn2 v4.8h, v1.4s, #13 \n" MEMACCESS(1) "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "w"(scale * 1.9259299444e-34f) // %3 : "cc", "memory", "v1", "v2", "v4" ); } TEST=LibYUVPlanarTest.TestHalfFloatPlane_One BUG=libyuv:560 R=hubbe@chromium.org Review URL: https://codereview.chromium.org/2430313008 . 
--- README.chromium | 2 +- include/libyuv/row.h | 6 +++++ include/libyuv/version.h | 2 +- source/planar_functions.cc | 10 ++++--- source/row_any.cc | 4 ++- source/row_gcc.cc | 30 +++++++++++++++++++++ source/row_neon64.cc | 49 +++++++++++++++++++++++++++++++++ unit_test/planar_test.cc | 55 +++++++++++++++++++++++++++++--------- 8 files changed, 138 insertions(+), 20 deletions(-) diff --git a/README.chromium b/README.chromium index 6e66021d9..c14ba8429 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1629 +Version: 1630 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 96861befb..601e05acc 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -1959,9 +1959,15 @@ void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale, void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width); void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_Any_F16C(const uint16* src, uint16* dst, float scale, + int width); void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width); void HalfFloatRow_Any_NEON(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float scale, int width); +void HalfFloat1Row_Any_NEON(const uint16* src, uint16* dst, float scale, + int width); void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, uint32 lumacoeff); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 75406bd7f..17a9c6660 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1629 +#define LIBYUV_VERSION 1630 #endif // 
INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 143ae869d..7a10a69f7 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2579,17 +2579,19 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y, #endif #if defined(HAS_HALFFLOATROW_F16C) if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { - HalfFloatRow = HalfFloatRow_Any_F16C; + HalfFloatRow = (scale == 1.0f) ? + HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; if (IS_ALIGNED(width, 16)) { - HalfFloatRow = HalfFloatRow_F16C; + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; } } #endif #if defined(HAS_HALFFLOATROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - HalfFloatRow = HalfFloatRow_Any_NEON; + HalfFloatRow = (scale == 1.0f) ? + HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - HalfFloatRow = HalfFloatRow_NEON; + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; } } #endif diff --git a/source/row_any.cc b/source/row_any.cc index ec0aa21d7..07e606c6e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -577,16 +577,18 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) } #ifdef HAS_HALFFLOATROW_SSE2 -ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15) +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 7) #endif #ifdef HAS_HALFFLOATROW_AVX2 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15) #endif #ifdef HAS_HALFFLOATROW_F16C ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15) +ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 1, 1, 15) #endif #ifdef HAS_HALFFLOATROW_NEON ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 1, 1, 7) +ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 1, 1, 7) #endif #undef ANY11P16 diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 8020108d0..bc15c7719 100644 --- a/source/row_gcc.cc +++ 
b/source/row_gcc.cc @@ -5410,6 +5410,36 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { } #endif // HAS_HALFFLOATROW_F16C +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) { + asm volatile ( + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2," MEMACCESS(1) " \n" + "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", + "xmm2", "xmm3" + ); +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 3d122680e..9508d4656 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2711,6 +2711,55 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ); } + +void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v1.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v1.4s, v1.4s \n" + "fcvtn v4.4h, v2.4s \n" // 8 half floats + "fcvtn2 v4.8h, v1.4s \n" + MEMACCESS(1) + "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v4" + ); +} + +void HalfFloatRow_NEON2(const uint16* src, uint16* dst, float scale, int width) { + asm volatile ( + 
"1: \n" + MEMACCESS(0) + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v1.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v1.4s, v1.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v1.4s, v1.4s, %3.s[0] \n" + "uqshrn v4.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v4.8h, v1.4s, #13 \n" + MEMACCESS(1) + "st1 {v4.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v4" + ); +} + void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile ( "1: \n" diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index a2eb1faac..c017c26a3 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2084,17 +2084,22 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) { int TestHalfFloatPlane(int benchmark_width, int benchmark_height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, - float scale) { + float scale, int mask) { int i, j; const int y_plane_size = benchmark_width * benchmark_height * 2; - align_buffer_page_end(orig_y, y_plane_size); - align_buffer_page_end(dst_c, y_plane_size); - align_buffer_page_end(dst_opt, y_plane_size); + align_buffer_page_end(orig_y, y_plane_size * 3); + uint8* dst_opt = orig_y + y_plane_size; + uint8* dst_c = orig_y + y_plane_size * 2; + MemRandomize(orig_y, y_plane_size); memset(dst_c, 0, y_plane_size); memset(dst_opt, 1, y_plane_size); + for (i = 0; i < y_plane_size / 2; ++i) { + reinterpret_cast(orig_y)[i] = static_cast(i & mask); + } + // Disable all optimizations. 
MaskCpuFlags(disable_cpu_flags); double c_time = get_time(); @@ -2122,38 +2127,62 @@ int TestHalfFloatPlane(int benchmark_width, int benchmark_height, } free_aligned_buffer_page_end(orig_y); - free_aligned_buffer_page_end(dst_c); - free_aligned_buffer_page_end(dst_opt); return diff; } // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes // exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally // happen since scale is 1/(1<