From 00950840d1c9bcbb3eb6ebc5aac5793e71166c8b Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 30 Sep 2022 15:12:37 -0700 Subject: [PATCH] YUY2ToNV12 using YUY2ToY and YUY2ToNVUV - Optimized YUY2ToNV12 that reduces it from 3 steps to 2 steps - Was SplitUV, memcpy Y, InterpolateUV - Now YUY2ToY, YUY2ToNVUV - rollback LIBYUV_UNLIMITED_DATA 3840x2160 1000 iterations: Pixel 2 Cortex A73 Was YUY2ToNV12_Opt (6515 ms) Now YUY2ToNV12_Opt (3350 ms) AB7 Mediatek P35 Cortex A53 Was YUY2ToNV12_Opt (6435 ms) Now YUY2ToNV12_Opt (3301 ms) Skylake AVX2 x64 Was YUY2ToNV12_Opt (1872 ms) Now YUY2ToNV12_Opt (1657 ms) SSE2 x64 Was YUY2ToNV12_Opt (2008 ms) Now YUY2ToNV12_Opt (1691 ms) Windows Skylake AVX2 32 bit x86 Was YUY2ToNV12_Opt (2161 ms) Now YUY2ToNV12_Opt (1628 ms) Bug: libyuv:943 Change-Id: I6c2ba2ae765413426baf770b837de114f808f6d0 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3929843 Reviewed-by: Wan-Teh Chang Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- include/libyuv/row.h | 35 ++++++++- source/planar_functions.cc | 142 ++++++++++++++----------------------- source/row_any.cc | 29 ++++++++ source/row_common.cc | 31 ++++++-- source/row_gcc.cc | 56 +++++++++++++++ source/row_neon.cc | 23 ++++++ source/row_neon64.cc | 23 ++++++ unit_test/color_test.cc | 12 ++-- unit_test/convert_test.cc | 6 +- 9 files changed, 253 insertions(+), 104 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index c09d51bac..b7493370a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -236,8 +236,8 @@ extern "C" { #define HAS_UYVYTOUVROW_AVX2 #define HAS_UYVYTOYROW_AVX2 #define HAS_YUY2TOARGBROW_AVX2 -#define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOYROW_AVX2 // Effects: @@ -326,6 +326,7 @@ extern "C" { #define HAS_SPLITXRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 +#define HAS_YUY2TONVUVROW_SSE2 #if defined(__x86_64__) || 
!defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -391,6 +392,7 @@ extern "C" { #define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 #define HAS_SWAPUVROW_AVX2 +#define HAS_YUY2TONVUVROW_AVX2 #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -524,8 +526,9 @@ extern "C" { #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON #define HAS_YUY2TOARGBROW_NEON -#define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON +#define HAS_YUY2TONVUVROW_NEON +#define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOYROW_NEON // Effects: @@ -4724,6 +4727,10 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4734,6 +4741,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4744,6 +4755,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4774,6 +4789,10 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, @@ -4784,6 +4803,10 @@ void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2, + int 
stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4794,6 +4817,10 @@ void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4804,6 +4831,10 @@ void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_uv, + int width); void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 45c34d307..67229ee7d 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -5095,9 +5095,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y, return 0; } -// TODO(fbarchard): Consider if width is even Y channel can be split -// directly. A SplitUVRow_Odd function could copy the remaining chroma. 
- LIBYUV_API int YUY2ToNV12(const uint8_t* src_yuy2, int src_stride_yuy2, @@ -5108,13 +5105,10 @@ int YUY2ToNV12(const uint8_t* src_yuy2, int width, int height) { int y; - int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, - int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = InterpolateRow_C; - + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2, + uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C; if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } @@ -5125,109 +5119,83 @@ int YUY2ToNV12(const uint8_t* src_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } -#if defined(HAS_SPLITUVROW_SSE2) +#if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - SplitUVRow = SplitUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_SSE2; + YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif -#if defined(HAS_SPLITUVROW_AVX2) +#if defined(HAS_YUY2TOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; } } #endif -#if defined(HAS_SPLITUVROW_NEON) +#if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_Any_NEON; + YUY2ToYRow = YUY2ToYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_NEON; + YUY2ToYRow = YUY2ToYRow_NEON; } } #endif -#if defined(HAS_SPLITUVROW_MSA) +#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { - SplitUVRow = SplitUVRow_Any_MSA; + YUY2ToYRow = YUY2ToYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_MSA; + 
YUY2ToYRow = YUY2ToYRow_MSA; } } #endif -#if defined(HAS_SPLITUVROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - SplitUVRow = SplitUVRow_Any_LSX; +#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + YUY2ToYRow = YUY2ToYRow_Any_LASX; if (IS_ALIGNED(width, 32)) { - SplitUVRow = SplitUVRow_LSX; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - InterpolateRow = InterpolateRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSSE3; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - InterpolateRow = InterpolateRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_AVX2; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - InterpolateRow = InterpolateRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_NEON; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - InterpolateRow = InterpolateRow_Any_MSA; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_MSA; - } - } -#endif -#if defined(HAS_INTERPOLATEROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - InterpolateRow = InterpolateRow_Any_LSX; - if (IS_ALIGNED(width, 32)) { - InterpolateRow = InterpolateRow_LSX; + YUY2ToYRow = YUY2ToYRow_LASX; } } #endif - { - int awidth = halfwidth * 2; - // row of y and 2 rows of uv - align_buffer_64(rows, awidth * 3); +#if defined(HAS_YUY2TONVUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TONVUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TONVUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToNVUVRow = 
YUY2ToNVUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToNVUVRow = YUY2ToNVUVRow_NEON; + } + } +#endif - for (y = 0; y < height - 1; y += 2) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, rows + awidth, awidth); - memcpy(dst_y, rows, width); - SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); - memcpy(dst_y + dst_stride_y, rows, width); - InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); - src_yuy2 += src_stride_yuy2 * 2; - dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; - } - if (height & 1) { - // Split Y from UV. - SplitUVRow(src_yuy2, rows, dst_uv, awidth); - memcpy(dst_y, rows, width); - } - free_aligned_buffer_64(rows); + for (y = 0; y < height - 1; y += 2) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width); } return 0; } diff --git a/source/row_any.cc b/source/row_any.cc index d6149ee52..413080fd1 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -673,6 +673,35 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15) #endif #undef ANY21 +// Any 2 planes to 1 with stride +// width is measured in source pixels. 
4 bytes contains 2 pixels
+#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                        \
+  void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, \
+               int width) {                                               \
+    SIMD_ALIGNED(uint8_t temp[64 * 3]); /* 2 source rows + 1 UV row */    \
+    memset(temp, 0, 64 * 2); /* for msan */                               \
+    int awidth = (width + 1) / 2;                                         \
+    int r = awidth & MASK;                                                \
+    int n = awidth & ~MASK;                                               \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2);                     \
+    }                                                                     \
+    memcpy(temp, src_yuy2 + n * SBPP, r * SBPP);                          \
+    memcpy(temp + 64, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP);       \
+    ANY_SIMD(temp, 64, temp + 128, (MASK + 1) * 2); /* 1 SIMD step */     \
+    memcpy(dst_uv + n * BPP, temp + 128, r * BPP);                        \
+  }
+
+#ifdef HAS_YUY2TONVUVROW_NEON
+ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7)
+#endif
+#ifdef HAS_YUY2TONVUVROW_SSE2
+ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7)
+#endif
+#ifdef HAS_YUY2TONVUVROW_AVX2
+ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15)
+#endif
+
 // Any 2 planes to 1 with yuvconstants
 #define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)            \
   void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
diff --git a/source/row_common.cc b/source/row_common.cc
index a177d8bb3..8bfa4b8c2 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -22,9 +22,13 @@ extern "C" {
 #endif
 
 // This macro controls YUV to RGB using unsigned math to extend range of
-// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B.
-// Enable this macro for backwards compatiability with limited range 0 to 2.
-// LIBYUV_LIMITED_DATA
+// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
+// LIBYUV_UNLIMITED_DATA
+
+// Macros to enable unlimited data for each colorspace
+// LIBYUV_UNLIMITED_BT601
+// LIBYUV_UNLIMITED_BT709
+// LIBYUV_UNLIMITED_BT2020
 
 // The following macro from row_win makes the C code match the row_win code,
 // which is 7 bit fixed point for ARGBToI420:
@@ -1480,7 +1484,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
 // KR = 0.299; KB = 0.114
 
 // U and V contributions to R,G,B.
-#if !defined(LIBYUV_LIMITED_DATA)
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
 #define UB 129 /* round(2.018 * 64) */
 #else
 #define UB 128 /* max(128, round(2.018 * 64)) */
@@ -1534,7 +1538,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
 // KR = 0.2126, KB = 0.0722
 
 // U and V contributions to R,G,B.
-#if !defined(LIBYUV_LIMITED_DATA)
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
 #define UB 135 /* round(2.112 * 64) */
 #else
 #define UB 128 /* max(128, round(2.112 * 64)) */
@@ -1588,7 +1592,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
 // KR = 0.2627; KB = 0.0593
 
 // U and V contributions to R,G,B.
-#if !defined(LIBYUV_LIMITED_DATA)
+#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
 #define UB 137 /* round(2.142 * 64) */
 #else
 #define UB 128 /* max(128, round(2.142 * 64)) */
@@ -3094,6 +3098,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2,
   }
 }
 
+// Average the chroma of 2 rows of YUY2 (422) into 1 row of UV (NV12).
+void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_uv,
+                     int width) {
+  // Second source row; a stride of 0 averages the row with itself.
+  const uint8_t* src_yuy2_1 = src_yuy2 + src_stride_yuy2;
+  int x;
+  // Every 2 pixels are 4 source bytes; bytes 1 and 3 feed the UV pair.
+  for (x = 0; x < width; x += 2) {
+    dst_uv[x + 0] = (src_yuy2[x * 2 + 1] + src_yuy2_1[x * 2 + 1] + 1) >> 1;
+    dst_uv[x + 1] = (src_yuy2[x * 2 + 3] + src_yuy2_1[x * 2 + 3] + 1) >> 1;
+  }
+}
+
 // Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
                       uint8_t* dst_u,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 5a8a492d1..ad1c052e9 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -6739,6 +6739,33 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
       : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
+void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
+                        int stride_yuy2,
+                        uint8_t* dst_uv,
+                        int width) {  // width is counted in source pixels
+  asm volatile(LABELALIGN
+               "1:                                        \n"  // 32 bytes/row
+               "movdqu      (%0),%%xmm0                   \n"  // row 0 lo
+               "movdqu      0x10(%0),%%xmm1               \n"  // row 0 hi
+               "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // row 1 lo
+               "movdqu      0x10(%0,%3,1),%%xmm3          \n"  // row 1 hi
+               "lea         0x20(%0),%0                   \n"  // src += 32
+               "pavgb       %%xmm2,%%xmm0                 \n"  // avg 2 rows
+               "pavgb       %%xmm3,%%xmm1                 \n"
+               "psrlw       $0x8,%%xmm0                   \n"  // keep odd bytes
+               "psrlw       $0x8,%%xmm1                   \n"
+               "packuswb    %%xmm1,%%xmm0                 \n"  // words to bytes
+               "movdqu      %%xmm0,(%1)                   \n"  // store 16 bytes
+               "lea         0x10(%1),%1                   \n"  // dst += 16
+               "sub         $0x10,%2                      \n"  // 16 pixels done
+               "jg          1b                            \n"
+               : "+r"(src_yuy2),               // %0
+                 "+r"(dst_uv),                 // %1
+                 "+r"(width)                   // %2
+               : "r"((intptr_t)(stride_yuy2))  // %3
+               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+}
+
 void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                       int stride_yuy2,
                       uint8_t* dst_u,
@@ -6939,6 +6966,35 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
       : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
+void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
+                        int stride_yuy2,
+                        uint8_t* dst_uv,
+                        int width) {  // width is counted in source pixels
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"  // 64 bytes per row
+      "vmovdqu     (%0),%%ymm0                   \n"  // row 0 lo
+      "vmovdqu     0x20(%0),%%ymm1               \n"  // row 0 hi
+      "vpavgb      0x00(%0,%3,1),%%ymm0,%%ymm0   \n"  // avg with row 1
+      "vpavgb      0x20(%0,%3,1),%%ymm1,%%ymm1   \n"
+      "lea         0x40(%0),%0                   \n"  // src += 64
+      "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"  // keep odd bytes
+      "vpsrlw      $0x8,%%ymm1,%%ymm1            \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // per-lane pack
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // qwords 0,2,1,3
+      "vmovdqu     %%ymm0,(%1)                   \n"  // store 32 bytes
+      "lea         0x20(%1),%1                   \n"  // dst += 32
+      "sub         $0x20,%2                      \n"  // 32 pixels done
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),               // %0
+        "+r"(dst_uv),                 // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(stride_yuy2))  // %3
+      : "memory", "cc", "xmm0", "xmm1");  // ymm0/ymm1 listed by xmm name
+}
+
 void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                       int stride_yuy2,
uint8_t* dst_u,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index df947e5e6..c0db6f1de 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1583,6 +1583,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
   );
 }
 
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+                        int stride_yuy2,
+                        uint8_t* dst_uv,
+                        int width) {  // width is counted in source pixels
+  asm volatile(
+      "add         %1, %0, %1                    \n"  // stride + src_yuy2
+      "1:                                        \n"
+      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pixels of YUY2.
+      "subs        %3, %3, #16                   \n"  // 16 pixels = 8 UVs.
+      "vld2.8      {q2, q3}, [%1]!               \n"  // load next row YUY2.
+      "vrhadd.u8   q0, q1, q3                    \n"  // average UV rows into q0
+      "vst1.8      {q0}, [%2]!                   \n"  // store 8 UV.
+      "bgt         1b                            \n"
+      : "+r"(src_yuy2),     // %0
+        "+r"(stride_yuy2),  // %1
+        "+r"(dst_uv),       // %2
+        "+r"(width)         // %3
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+        "d7"  // Clobber List: q0-q3; callee-saved q4 is not touched
+  );
+}
+
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                          uint8_t* dst_argb,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 3cbd9b79b..880a5f060 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1808,6 +1808,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
   );
 }
 
+void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
+                        int stride_yuy2,
+                        uint8_t* dst_uv,
+                        int width) {
+  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+  asm volatile(
+      "1:                                        \n"
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels
+      "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
+      "ld2         {v2.16b,v3.16b}, [%1], #32    \n"  // load next row
+      "urhadd      v4.16b, v1.16b, v3.16b        \n"  // average rows of UV
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "st1         {v4.16b}, [%2], #16           \n"  // store 8 UV.
+      "b.gt        1b                            \n"
+      : "+r"(src_yuy2),   // %0
+        "+r"(src_yuy2b),  // %1
+        "+r"(dst_uv),     // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+  );
+}
+
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc index 06dd064b3..01267ff1e 100644 --- a/unit_test/color_test.cc +++ b/unit_test/color_test.cc @@ -32,10 +32,10 @@ namespace libyuv { #endif #define ERROR_R 1 #define ERROR_G 1 -#if defined(LIBYUV_LIMITED_DATA) -#define ERROR_B 18 -#else +#ifdef LIBYUV_UNLIMITED_DATA #define ERROR_B 1 +#else +#define ERROR_B 18 #endif #define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \ @@ -502,10 +502,10 @@ TEST_F(LibYUVColorTest, TestYUV) { YUVToRGB(240, 0, 0, &r1, &g1, &b1); EXPECT_EQ(57, r1); EXPECT_EQ(255, g1); -#if defined(LIBYUV_LIMITED_DATA) - EXPECT_EQ(5, b1); -#else +#ifdef LIBYUV_UNLIMITED_DATA EXPECT_EQ(3, b1); +#else + EXPECT_EQ(5, b1); #endif for (int i = 0; i < 256; ++i) { diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index dc6d7a723..47eff2ece 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -2620,10 +2620,10 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) { // Test result matches known hash value. uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381); -#if defined(LIBYUV_LIMITED_DATA) - EXPECT_EQ(dst_argb_hash, 2355976473u); -#else +#ifdef LIBYUV_UNLIMITED_DATA EXPECT_EQ(dst_argb_hash, 3900633302u); +#else + EXPECT_EQ(dst_argb_hash, 2355976473u); #endif free_aligned_buffer_page_end(dst_argb);