From 7fc932ddd306c11493a27b65fdd042ae15be79bf Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 29 Sep 2016 15:06:30 -0700
Subject: [PATCH] Add low level support for 12 bit 420, 422 and 444 YUV video frame conversion.

BUG=libyuv:560,chromium:445071
TEST=untested
R=hubbe@chromium.org

Review URL: https://codereview.chromium.org/2371293002 .
---
 include/libyuv/planar_functions.h |  8 ++++++
 include/libyuv/row.h              |  8 ++++--
 source/planar_functions.cc        | 46 +++++++++++++++++++++++++++++++
 source/row_any.cc                 | 22 +++++++++++++++
 source/row_common.cc              | 19 +++++++++++++
 source/row_gcc.cc                 | 33 ++++++++++++++++++++++
 source/row_win.cc                 | 23 ++++++++--------
 unit_test/planar_test.cc          | 40 +++++++++++++++++++++++++++
 8 files changed, 185 insertions(+), 14 deletions(-)

diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 9b0f994b0..1b57b2926 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -281,6 +281,14 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
                    const float* poly,
                    int width, int height);
 
+// Convert plane of 16 bit shorts to half floats after multiplying by scale.
+// Strides are in bytes. Negative height inverts. Returns 0, or -1 on bad args.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y, int src_stride_y,
+                   uint16* dst_y, int dst_stride_y,
+                   float scale,
+                   int width, int height);
+
 // Quantize a rectangle of ARGB. Alpha unaffected.
 // scale is a 16 bit fractional fixed point scaler between 0 and 65535.
 // interval_size should be a value between 1 and 255.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 227156a19..0b4eec92f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -231,6 +231,7 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
 
 // Effects:
 #define HAS_ARGBADDROW_AVX2
@@ -252,7 +253,6 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_AVX2
 #define HAS_J400TOARGBROW_AVX2
 #define HAS_RGB565TOARGBROW_AVX2
-#define HAS_SHORTTOF16ROW_AVX2
 #endif
 
 // The following are also available on x64 Visual C.
@@ -1934,8 +1934,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                             int width);
 
 // Scale and convert to half float.
-void ShortToF16Row_C(const uint16* src, int16* dst, float scale, int width);
-void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width);
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
+                           int width);
 
 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                              const uint8* luma, uint32 lumacoeff);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index b919e9615..20e9c66c0 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -83,6 +83,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
 }
 
 // TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
 LIBYUV_API
 void CopyPlane_16(const uint16* src_y, int src_stride_y,
                   uint16* dst_y, int dst_stride_y,
@@ -2441,6 +2442,51 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Convert plane of 16 bit shorts to half floats after multiplying by scale.
+// Strides are in bytes. Negative height inverts. Returns 0, or -1 on bad args.
+LIBYUV_API
+int HalfFloatPlane(const uint16* src_y, int src_stride_y,
+                   uint16* dst_y, int dst_stride_y,
+                   float scale,
+                   int width, int height) {
+  int y;
+  void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
+      HalfFloatRow_C;  // Portable fallback; replaced below when AVX2 is usable.
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;  // Reject null planes and empty dimensions.
+  }
+  src_stride_y >>= 1;  // Strides arrive in bytes; convert to uint16 elements.
+  dst_stride_y >>= 1;
+  // Negative height means invert the image (source is walked bottom-up).
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;  // Start at the last row.
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows: a contiguous plane is converted as one long row.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_HALFFLOATROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HalfFloatRow = HalfFloatRow_Any_AVX2;  // Width not a multiple of 16.
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = HalfFloatRow_AVX2;  // Width is a multiple of 16.
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    HalfFloatRow(src_y, dst_y, scale, width);  // Convert one row.
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
 int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
diff --git a/source/row_any.cc b/source/row_any.cc
index 28b6758fc..0a978e987 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -546,6 +546,28 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
 #endif
 #undef ANY11P
 
+// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+// temp holds 32 source shorts followed by 32 destination shorts.
+#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                    \
+    void NAMEANY(const uint16* src_ptr, uint16* dst_ptr,                   \
+                 T shuffler, int width) {                                  \
+      SIMD_ALIGNED(uint16 temp[32 * 2]);                                   \
+      memset(temp, 0, 32 * sizeof(uint16));  /* source half, for msan */   \
+      int r = width & MASK;   /* leftover pixels for the padded pass */    \
+      int n = width & ~MASK;  /* pixels covered by full SIMD passes */     \
+      if (n > 0) {                                                         \
+        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                           \
+      }                                                                    \
+      memcpy(temp, src_ptr + n * SBPP, r * SBPP * sizeof(uint16));         \
+      ANY_SIMD(temp, temp + 32, shuffler, MASK + 1);                       \
+      memcpy(dst_ptr + n * BPP, temp + 32, r * BPP * sizeof(uint16));      \
+    }
+
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
+#endif
+#undef ANY11P16
+
 // Any 1 to 1 with yuvconstants
 #define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \
     void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                     \
diff --git a/source/row_common.cc b/source/row_common.cc
index 099ab600d..e194e6cd1 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2333,6 +2333,25 @@ void ARGBPolynomialRow_C(const uint8* src_argb,
   }
 }
 
+// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor
+// adjusts the source integer range to the half float range desired.
+
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
+  int i;
+  float mult = 1.9259299444e-34f * scale;  // 2^-112 * scale
+  for (i = 0; i < width; ++i) {
+    // Read the float's bits through a union; a pointer cast breaks aliasing.
+    union { float f; uint32 u; } value = { src[i] * mult };
+    dst[i] = (uint16)(value.u >> 13);  // Drop the low 13 mantissa bits.
+  }
+}
 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                              const uint8* luma, uint32 lumacoeff) {
   uint32 bc = lumacoeff & 0xff;
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 1ac7ef1aa..e4b4c5c1b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5366,6 +5366,39 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "vbroadcastss %3, %%ymm4 \n"  // scale in all 8 lanes
+
+    // 16 pixel loop.
+    LABELALIGN
+    "1: \n"
+    "vpmovzxwd " MEMACCESS(0) ",%%ymm0 \n"  // 8 shorts -> 8 ints
+    "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm1 \n"  // 8 more
+    "lea " MEMLEA(0x20,0) ",%0 \n"  // src += 16 shorts
+    "vcvtdq2ps %%ymm0,%%ymm0 \n"  // ints -> floats
+    "vcvtdq2ps %%ymm1,%%ymm1 \n"
+    "vmulps %%ymm0,%%ymm4,%%ymm0 \n"  // apply scale
+    "vmulps %%ymm1,%%ymm4,%%ymm1 \n"
+    "vcvtps2ph $3, %%ymm0, %%xmm0 \n"  // floats -> half floats, truncate
+    "vcvtps2ph $3, %%ymm1, %%xmm1 \n"
+    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
+    "vmovdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+    "lea " MEMLEA(0x20,1) ",%1 \n"  // dst += 16 half floats
+    "sub $0x10,%2 \n"  // 16 pixels done
+    "jg 1b \n"
+    "vzeroupper \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  : "x"(scale)   // %3
+  : "memory", "cc",
+    "xmm0", "xmm1", "xmm4"  // every vector register the loop writes
+  );
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
diff --git a/source/row_win.cc b/source/row_win.cc
index d54f05e29..baf6c940a 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6095,13 +6095,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
-// Samples assumed to be unsigned in low 9, 10 or 12 bits.
 Scale factor
-// adjust the sample range to 0 to 1 using a float multiply.
-// e.g. 9 bit scale is 1.0f / 512.0f
-// e.g. 10 bit scale is 1.0f / 1024.0f
-#ifdef HAS_SHORTTOHALFFLOAT_AVX2
+#ifdef HAS_HALFFLOATROW_AVX2
 __declspec(naked)
-void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
+void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
   __asm {
     mov        eax, [esp + 4]   /* src */
     mov        edx, [esp + 8]   /* dst */
@@ -6111,19 +6107,24 @@ void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
     // 8 pixel loop.
  convertloop:
     vpmovzxwd  ymm0, xmmword ptr [eax]  // 8 shorts -> 8 ints
-    lea        eax, [eax + 16]
+    vpmovzxwd  ymm1, xmmword ptr [eax + 16]  // 8 more shorts
+    lea        eax, [eax + 32]  // advance src by 16 shorts (32 bytes)
     vcvtdq2ps  ymm0, ymm0  // convert 8 ints to floats
+    vcvtdq2ps  ymm1, ymm1  // convert 8 more ints to floats
     vmulps     ymm0, ymm0, ymm4  // scale to normalized range 0 to 1
-    vcvtps2ph  xmm0, ymm0, 0  // float conver to 8 half floats round even
+    vmulps     ymm1, ymm1, ymm4  // same multiply for the second 8 floats
+    vcvtps2ph  xmm0, ymm0, 3  // float convert to 8 half floats truncate
+    vcvtps2ph  xmm1, ymm1, 3  // second 8 half floats, also truncated
     vmovdqu    [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 8
+    vmovdqu    [edx + 16], xmm1  // store the second 8 half floats
+    lea        edx, [edx + 32]  // advance dst by 16 half floats (32 bytes)
+    sub        ecx, 16  // 16 values per pass; "8 pixel loop" above is stale
     jg         convertloop
     vzeroupper
     ret
   }
 }
-#endif  // HAS_SHORTTOHALFFLOAT_AVX2
+#endif  // HAS_HALFFLOATROW_AVX2
 
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index d30d6b2e1..722074f73 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2081,6 +2081,46 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
   }
 }
 
+TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
+  int i, j;
+  const int y_plane_size = benchmark_width_ * benchmark_height_ * 2;  // bytes
+
+  align_buffer_page_end(orig_y, y_plane_size);
+  align_buffer_page_end(dst_c, y_plane_size);
+  align_buffer_page_end(dst_opt, y_plane_size);
+  MemRandomize(orig_y, y_plane_size);
+  memset(dst_c, 0, y_plane_size);
+  memset(dst_opt, 1, y_plane_size);  // Differs from dst_c until both are written.
+
+  // Disable all optimizations.
+  MaskCpuFlags(disable_cpu_flags_);
+  double c_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,  // stride in bytes
+                   (uint16*)dst_c, benchmark_width_ * 2,
+                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+  }
+  c_time = (get_time() - c_time) / benchmark_iterations_;
+
+  // Enable optimizations.
+  MaskCpuFlags(benchmark_cpu_info_);
+  double opt_time = get_time();
+  for (j = 0; j < benchmark_iterations_; j++) {
+    HalfFloatPlane((uint16*)orig_y, benchmark_width_ * 2,  // stride in bytes
+                   (uint16*)dst_opt, benchmark_width_ * 2,
+                   1.0f / 4096.0f, benchmark_width_, benchmark_height_);
+  }
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+
+  for (i = 0; i < y_plane_size; ++i) {  // Both paths must produce equal output.
+    EXPECT_EQ(dst_c[i], dst_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  free_aligned_buffer_page_end(dst_c);
+  free_aligned_buffer_page_end(dst_opt);
+}
+
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
   SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
   SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);