From 6f729fbe658a40dfd993fa8b22bd612bb17cde5c Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Mon, 16 Jun 2025 16:56:37 -0700
Subject: [PATCH] ARGBToUV SSE use average of 4 pixels

- Was using avgb twice for non-exact and C for exact.

On Skylake Xeon:
Now SSE3 ARGBToJ420_Opt (326 ms)
Was
Exact C ARGBToJ420_Opt (871 ms)
Not exact AVX2 ARGBToJ420_Opt (237 ms)
Not exact SSSE3 ARGBToJ420_Opt (312 ms)

Bug: 381138208
Change-Id: I6d1081bb52e36f06736c0c6575fa82bb2268629b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6629605
Reviewed-by: Frank Barchard
Reviewed-by: Ben Weiss
---
 README.chromium                |   2 +-
 include/libyuv/row.h           |  23 ++-
 include/libyuv/version.h       |   2 +-
 source/row_common.cc           | 111 ---------------
 source/row_gcc.cc              | 251 +++++++++------------------------
 source/row_win.cc              |  10 +-
 unit_test/convert_argb_test.cc |  80 +++++++++++
 7 files changed, 164 insertions(+), 315 deletions(-)

diff --git a/README.chromium b/README.chromium
index e8d8d566b..021887717 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1912
+Version: 1913
 License: BSD-3-Clause
 License File: LICENSE
 Shipped: yes
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index b7114cb21..e45f04cbb 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -79,12 +79,6 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
-#if !defined(LIBYUV_BIT_EXACT)
-#define HAS_ABGRTOUVROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
-#define HAS_RGBATOUVROW_SSSE3
-#endif

 // Effects:
 #define HAS_ARGBADDROW_SSE2
@@ -272,11 +266,13 @@ extern "C" {
 #define HAS_RGB24TOYROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
-#if !defined(LIBYUV_BIT_EXACT)
 // TODO: adjust row_win to use 8 bit negative coefficients.
 #define HAS_ABGRTOUVJROW_SSSE3
 #define HAS_ARGBTOUVJROW_SSSE3
-#endif
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3

 #if defined(__x86_64__) || !defined(__pic__)
 // TODO(fbarchard): fix build error on android_full_debug=1
@@ -350,12 +346,11 @@ extern "C" {
 #define HAS_SPLITXRGBROW_AVX2
 #define HAS_SWAPUVROW_AVX2
 #define HAS_YUY2TONVUVROW_AVX2
-#if !defined(LIBYUV_BIT_EXACT)
-#define HAS_ABGRTOUVJROW_AVX2
-#define HAS_ABGRTOUVROW_AVX2
-#define HAS_ARGBTOUVJROW_AVX2
-#define HAS_ARGBTOUVROW_AVX2
-#endif
+// TODO: Port SSSE3 to AVX2
+// #define HAS_ABGRTOUVJROW_AVX2
+// #define HAS_ABGRTOUVROW_AVX2
+// #define HAS_ARGBTOUVJROW_AVX2
+// #define HAS_ARGBTOUVROW_AVX2

 #if defined(__x86_64__) || !defined(__pic__)
 // TODO(fbarchard): fix build error on android_full_debug=1
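The row.h TODO above concerns pmaddubsw, whose per-byte multipliers are signed int8 in the range -128..127. The JPEG blue weight is +128, which does not fit, so the gcc kernels keep every UV weight negated (see kABGRJPEGUVConstants in row_gcc.cc below) and restore the sign by subtracting the dot product from a 0x8000 bias. A minimal sketch of that arithmetic, not part of the patch:

// Minimal sketch (not from the patch): negated UV weights for pmaddubsw.
// pmaddubsw multiplies unsigned bytes by signed int8 weights; +128 would
// overflow int8, but -128 fits, and (0x8000 - dot) >> 8 restores the sign.
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // ABGR memory order is R,G,B,A; weights from kABGRJPEGUVConstants.
  const int8_t ku_neg[4] = {43, 85, -128, 0};  // negation of -43R -85G +128B
  const uint8_t blue[4] = {0, 0, 255, 255};    // one saturated-blue pixel
  int dot = 0;
  for (int i = 0; i < 4; ++i) {
    dot += blue[i] * ku_neg[i];  // what pmaddubsw + phaddw accumulate
  }
  int u = (0x8000 - dot) >> 8;  // 128 - dot/256
  printf("u(blue) = %d\n", u);  // prints 255, the full-range Cb maximum
  return 0;
}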
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 8cd8ee6e4..fca53bd93 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1912
+#define LIBYUV_VERSION 1913

 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_common.cc b/source/row_common.cc
index 7101ec321..3062fd8f9 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -36,10 +36,6 @@ extern "C" {
 // LIBYUV_UNLIMITED_BT709
 // LIBYUV_UNLIMITED_BT2020

-#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
-                                   defined(__i386__) || defined(_M_IX86))
-#define LIBYUV_ARGBTOUV_PAVGB 1
-#endif
 #if defined(LIBYUV_BIT_EXACT)
 #define LIBYUV_UNATTENUATE_DUP 1
 #endif
@@ -627,7 +623,6 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64,
 // b -0.1406 * 128 = −17.9968 = -18
 // g -0.7344 * 128 = −94.0032 = -94
 // r 0.875 * 128 = 112.0 = 112
-
 static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
   return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8);
 }
@@ -639,46 +634,6 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
 }

 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
-// ARGBToY_C and ARGBToUV_C
-// Intel version of UV mimic SSE/AVX which does 2 pavgb
-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-#define MAKEROWY(NAME, R, G, B, BPP)                                       \
-  void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
-    int x;                                                                 \
-    for (x = 0; x < width; ++x) {                                          \
-      dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]);               \
-      src_rgb += BPP;                                                      \
-      dst_y += 1;                                                          \
-    }                                                                      \
-  }                                                                        \
-  void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb,         \
-                       uint8_t* dst_u, uint8_t* dst_v, int width) {        \
-    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                    \
-    int x;                                                                 \
-    for (x = 0; x < width - 1; x += 2) {                                   \
-      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]),                     \
-                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP]));        \
-      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]),                     \
-                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP]));        \
-      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]),                     \
-                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP]));        \
-      dst_u[0] = RGBToU(ar, ag, ab);                                       \
-      dst_v[0] = RGBToV(ar, ag, ab);                                       \
-      src_rgb += BPP * 2;                                                  \
-      src_rgb1 += BPP * 2;                                                 \
-      dst_u += 1;                                                          \
-      dst_v += 1;                                                          \
-    }                                                                      \
-    if (width & 1) {                                                       \
-      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]);                          \
-      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]);                          \
-      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]);                          \
-      dst_u[0] = RGBToU(ar, ag, ab);                                       \
-      dst_v[0] = RGBToV(ar, ag, ab);                                       \
-    }                                                                      \
-  }
-#else
-// ARM version does average of 4 pixels with rounding
 #define MAKEROWY(NAME, R, G, B, BPP)                                       \
   void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
     int x;                                                                 \
@@ -717,7 +672,6 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
       dst_v[0] = RGBToV(ar, ag, ab);                                       \
     }                                                                      \
   }
-#endif

 MAKEROWY(ARGB, 2, 1, 0, 4)
 MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -756,45 +710,6 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
 }

 // ARGBToYJ_C and ARGBToUVJ_C
-// Intel version mimic SSE/AVX which does 2 pavgb
-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-#define MAKEROWYJ(NAME, R, G, B, BPP)                                        \
-  void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) {  \
-    int x;                                                                   \
-    for (x = 0; x < width; ++x) {                                            \
-      dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]);                \
-      src_rgb += BPP;                                                        \
-      dst_y += 1;                                                            \
-    }                                                                        \
-  }                                                                          \
-  void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb,          \
-                        uint8_t* dst_u, uint8_t* dst_v, int width) {         \
-    const uint8_t* src_rgb1 = src_rgb + src_stride_rgb;                      \
-    int x;                                                                   \
-    for (x = 0; x < width - 1; x += 2) {                                     \
-      uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]),                       \
-                        AVGB(src_rgb[B + BPP], src_rgb1[B + BPP]));          \
-      uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]),                       \
-                        AVGB(src_rgb[G + BPP], src_rgb1[G + BPP]));          \
-      uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]),                       \
-                        AVGB(src_rgb[R + BPP], src_rgb1[R + BPP]));          \
-      dst_u[0] = RGBToUJ(ar, ag, ab);                                        \
-      dst_v[0] = RGBToVJ(ar, ag, ab);                                        \
-      src_rgb += BPP * 2;                                                    \
-      src_rgb1 += BPP * 2;                                                   \
-      dst_u += 1;                                                            \
-      dst_v += 1;                                                            \
-    }                                                                        \
-    if (width & 1) {                                                         \
-      uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]);                            \
-      uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]);                            \
-      uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]);                            \
-      dst_u[0] = RGBToUJ(ar, ag, ab);                                        \
-      dst_v[0] = RGBToVJ(ar, ag, ab);                                        \
-    }                                                                        \
-  }
-#else
-// ARM version does average of 4 pixels with rounding
 #define MAKEROWYJ(NAME, R, G, B, BPP)                                        \
   void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) {  \
     int x;                                                                   \
@@ -834,8 +749,6 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
     }                                                                        \
   }

-#endif
-
 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 MAKEROWYJ(ABGR, 0, 1, 2, 4)
 MAKEROWYJ(RGBA, 3, 2, 1, 4)
@@ -928,19 +841,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
     g3 = STATIC_CAST(uint8_t, (g3 << 2) | (g3 >> 4));
     r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));

-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
-    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
-    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-#else
     uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
     uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
     uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
     dst_u[0] = RGBToU(r, g, b);
     dst_v[0] = RGBToV(r, g, b);
-#endif

     src_rgb565 += 4;
     next_rgb565 += 4;
@@ -1009,19 +914,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
     g3 = STATIC_CAST(uint8_t, (g3 << 3) | (g3 >> 2));
     r3 = STATIC_CAST(uint8_t, (r3 << 3) | (r3 >> 2));

-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
-    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
-    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-#else
     uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
     uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
     uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
     dst_u[0] = RGBToU(r, g, b);
     dst_v[0] = RGBToV(r, g, b);
-#endif

     src_argb1555 += 4;
     next_argb1555 += 4;
@@ -1087,19 +984,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
     g3 = STATIC_CAST(uint8_t, (g3 << 4) | g3);
     r3 = STATIC_CAST(uint8_t, (r3 << 4) | r3);

-#if defined(LIBYUV_ARGBTOUV_PAVGB)
-    uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
-    uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
-    uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-#else
     uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2;
     uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2;
     uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2;
     dst_u[0] = RGBToU(r, g, b);
     dst_v[0] = RGBToV(r, g, b);
-#endif

     src_argb4444 += 4;
     next_argb4444 += 4;
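The LIBYUV_ARGBTOUV_PAVGB path deleted above chained pavgb-style averages per channel, each of which rounds up, so its result can exceed the single-rounded average (a + b + c + d + 2) >> 2 that the C code now uses unconditionally; that is the "avgb twice for non-exact" the commit message refers to. A minimal sketch of the mismatch, not libyuv code:

// Minimal sketch (not libyuv code): chained AVGB rounding vs. the exact
// rounded average of four samples. AVGB matches the pavgb instruction.
#include <stdint.h>
#include <stdio.h>

#define AVGB(a, b) (((a) + (b) + 1) >> 1)

int main(void) {
  uint8_t a = 0, b = 1, c = 0, d = 0;          // four 2x2 subsampled samples
  int chained = AVGB(AVGB(a, b), AVGB(c, d));  // rounds up twice: 1
  int exact = (a + b + c + d + 2) >> 2;        // rounds once: 0
  printf("chained = %d, exact = %d\n", chained, exact);
  return 0;
}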
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 6fa8261af..6af2a1a9b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1650,7 +1650,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
 #if defined(__i386__)
         "+m"(width)  // %3
 #else
-        "+rm"(width)   // %3
+        "+rm"(width)  // %3
 #endif
       : "m"(rgbuvconstants->kRGBToU),  // %4
         "m"(rgbuvconstants->kRGBToV),  // %5
@@ -1721,7 +1721,7 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 #if defined(__i386__)
         "+m"(width)  // %3
 #else
-        "+rm"(width)   // %3
+        "+rm"(width)  // %3
 #endif
       : "m"(rgbuvconstants->kRGBToU),  // %4
         "m"(rgbuvconstants->kRGBToV),  // %5
@@ -1733,161 +1733,71 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 #endif  // HAS_ARGBTOUV444ROW_AVX2

 #ifdef HAS_ARGBTOUVROW_SSSE3
-
-void OMITFP
-ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
-                        int src_stride_argb,
-                        uint8_t* dst_u,
-                        uint8_t* dst_v,
-                        int width,
-                        const struct RgbUVConstants* rgbuvconstants) {
+// 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V
+// ARGBToUV does rounding average of 4 ARGB pixels
+void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width,
+                             const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
-      "movdqa %0,%%xmm3 \n"
-      "movdqa %1,%%xmm4 \n"
-      "movdqa %2,%%xmm5 \n"
-      :
-      : "m"(rgbuvconstants->kRGBToU),  // %0
-        "m"(rgbuvconstants->kRGBToV),  // %1
-        "m"(kAddUV128)                 // %2
-      : "xmm3", "xmm4", "xmm5");
-
-  asm volatile("sub %1,%2 \n"
-
-               LABELALIGN
-               "1: \n"
-               "movdqu (%0),%%xmm0 \n"
-               "movdqu 0x00(%0,%4,1),%%xmm7 \n"
-               "pavgb %%xmm7,%%xmm0 \n"
-               "movdqu 0x10(%0),%%xmm1 \n"
-               "movdqu 0x10(%0,%4,1),%%xmm7 \n"
-               "pavgb %%xmm7,%%xmm1 \n"
-               "movdqu 0x20(%0),%%xmm2 \n"
-               "movdqu 0x20(%0,%4,1),%%xmm7 \n"
-               "pavgb %%xmm7,%%xmm2 \n"
-               "movdqu 0x30(%0),%%xmm6 \n"
-               "movdqu 0x30(%0,%4,1),%%xmm7 \n"
-               "pavgb %%xmm7,%%xmm6 \n"
-               "lea 0x40(%0),%0 \n"
-               "movdqa %%xmm0,%%xmm7 \n"
-               "shufps $0x88,%%xmm1,%%xmm0 \n"
-               "shufps $0xdd,%%xmm1,%%xmm7 \n"
-               "pavgb %%xmm7,%%xmm0 \n"
-               "movdqa %%xmm2,%%xmm7 \n"
-               "shufps $0x88,%%xmm6,%%xmm2 \n"
-               "shufps $0xdd,%%xmm6,%%xmm7 \n"
-               "pavgb %%xmm7,%%xmm2 \n"
-
-               "movdqa %%xmm0,%%xmm1 \n"
-               "movdqa %%xmm2,%%xmm6 \n"
-               "pmaddubsw %%xmm3,%%xmm0 \n"
-               "pmaddubsw %%xmm3,%%xmm2 \n"
-               "pmaddubsw %%xmm4,%%xmm1 \n"
-               "pmaddubsw %%xmm4,%%xmm6 \n"
-               "phaddw %%xmm2,%%xmm0 \n"
-               "phaddw %%xmm6,%%xmm1 \n"
-               "movdqa %%xmm5,%%xmm2 \n"
-               "movdqa %%xmm5,%%xmm6 \n"
-               "psubw %%xmm0,%%xmm2 \n"
-               "psubw %%xmm1,%%xmm6 \n"
-               "psrlw $0x8,%%xmm2 \n"
-               "psrlw $0x8,%%xmm6 \n"
-               "packuswb %%xmm6,%%xmm2 \n"
-               "movlps %%xmm2,(%1) \n"
-               "movhps %%xmm2,0x00(%1,%2,1) \n"
-               "lea 0x8(%1),%1 \n"
-               "subl $0x10,%3 \n"
-               "jg 1b \n"
-               : "+r"(src_argb),  // %0
-                 "+r"(dst_u),     // %1
-                 "+r"(dst_v),     // %2
-#if defined(__i386__)
-                 "+m"(width)  // %3
-#else
-                 "+rm"(width)  // %3
-#endif
-               : "r"((intptr_t)(src_stride_argb))  // %4
-               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
-                 "xmm6", "xmm7");
-}
-
-#endif  // HAS_ARGBTOUVROW_SSSE3
-
-// vpshufb for vphaddw + vpackuswb packed to shorts.
-// Coefficients expressed as negatives to allow 128
-struct UVMatrixConstants {
-  lvec8 kShufARGBToUV;
-  ulvec8 kAddUV128;
-};
-
-static const UVMatrixConstants kShufARGBToUV_AVX = {
-    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-    0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
-    0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128};
-
-void OMITFP
-ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
-                       int src_stride_argb,
-                       uint8_t* dst_u,
-                       uint8_t* dst_v,
-                       int width,
-                       const struct RgbUVConstants* rgbuvconstants) {
-  asm volatile(
-      "vbroadcastf128 %0,%%ymm6 \n"
-      "vbroadcastf128 %1,%%ymm7 \n"
-      :
-      : "m"(rgbuvconstants->kRGBToU),  // %0
-        "m"(rgbuvconstants->kRGBToV)   // %1
-      :);
-
-  asm volatile(
-#if !defined(__i386__)
-      "vmovdqa 0(%5),%%ymm8 \n"
-#endif
-      "vmovdqa 32(%5),%%ymm5 \n"
+      "pcmpeqb %%xmm4,%%xmm4 \n"  // 0x0101
+      "pabsb %%xmm4,%%xmm4 \n"
+      "movdqa %5,%%xmm6 \n"  // ARGB to U
+      "movdqa %6,%%xmm7 \n"  // ARGB to V
       "sub %1,%2 \n"

       LABELALIGN
       "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x20(%0),%%ymm1 \n"
-      "vmovdqu 0x40(%0),%%ymm2 \n"
-      "vmovdqu 0x60(%0),%%ymm3 \n"
-      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
-      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
-      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
-      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
-      "lea 0x80(%0),%0 \n"
-      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
-      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
-      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
-      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
-      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
-      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+      "movdqu (%0),%%xmm0 \n"  // Read 8 ARGB Pixels
+      "movdqu 0x10(%0),%%xmm5 \n"
+      "movdqa %%xmm0,%%xmm1 \n"
+      "shufps $0x88,%%xmm5,%%xmm0 \n"  // Even pixels
+      "shufps $0xdd,%%xmm5,%%xmm1 \n"  // Odd pixels
+      "movdqa %%xmm0,%%xmm5 \n"
+      "punpcklbw %%xmm1,%%xmm0 \n"  // aarrgbb
+      "punpckhbw %%xmm5,%%xmm1 \n"
+      "pmaddubsw %%xmm4,%%xmm0 \n"  // paired add argb
+      "pmaddubsw %%xmm4,%%xmm1 \n"

-      "vpmaddubsw %%ymm6,%%ymm0,%%ymm1 \n"
-      "vpmaddubsw %%ymm6,%%ymm2,%%ymm3 \n"
-      "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n"
-      "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n"
-      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
-      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
-      "vpsubw %%ymm0,%%ymm5,%%ymm0 \n"
-      "vpsubw %%ymm1,%%ymm5,%%ymm1 \n"
-      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
-      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
-      "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n"
-      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
-#if defined(__i386__)
-      "vpshufb (%5),%%ymm0,%%ymm0 \n"
-#else
-      "vpshufb %%ymm8,%%ymm0,%%ymm0 \n"
-#endif
-      "vextractf128 $0x0,%%ymm0,(%1) \n"
-      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
-      "lea 0x10(%1),%1 \n"
-      "subl $0x20,%3 \n"
+      "movdqu 0x00(%0,%4,1),%%xmm2 \n"  // Read 2nd row
+      "movdqu 0x10(%0,%4,1),%%xmm5 \n"
+      "movdqa %%xmm2,%%xmm3 \n"
+      "shufps $0x88,%%xmm5,%%xmm2 \n"  // Even
+      "shufps $0xdd,%%xmm5,%%xmm3 \n"  // Odd pixels
+      "movdqa %%xmm2,%%xmm5 \n"
+      "punpcklbw %%xmm3,%%xmm2 \n"  // aarrgbb
+      "punpckhbw %%xmm5,%%xmm3 \n"
+      "pmaddubsw %%xmm4,%%xmm2 \n"  // argb
+      "pmaddubsw %%xmm4,%%xmm3 \n"
+
+      "pxor %%xmm5,%%xmm5 \n"  // constant 0 for pavgw
+      "paddw %%xmm2,%%xmm0 \n"
+      "paddw %%xmm3,%%xmm1 \n"
+      "psrlw $1,%%xmm0 \n"  // round
+      "psrlw $1,%%xmm1 \n"
+      "pavgw %%xmm5,%%xmm0 \n"
+      "pavgw %%xmm5,%%xmm1 \n"
+      "packuswb %%xmm1,%%xmm0 \n"  // 4 ARGB pixels
+
+      "movdqa %%xmm0,%%xmm1 \n"
+      "pmaddubsw %%xmm6,%%xmm0 \n"  // u
+      "pmaddubsw %%xmm7,%%xmm1 \n"  // v
+      "phaddw %%xmm1,%%xmm0 \n"  // uuuuvvvv
+
+      "movdqa %7,%%xmm2 \n"  // 0x8000
+      "psubw %%xmm0,%%xmm2 \n"  // unsigned 0 to 0xffff
+      "psrlw $0x8,%%xmm2 \n"
+      "packuswb %%xmm2,%%xmm2 \n"
+      "movd %%xmm2,(%1) \n"  // Write 4 U's
+      "shufps $0xdd,%%xmm2,%%xmm2 \n"
+      "movd %%xmm2,0x00(%1,%2,1) \n"  // Write 4 V's
+
+      "lea 0x20(%0),%0 \n"
+      "lea 0x4(%1),%1 \n"
+      "subl $0x8,%3 \n"
       "jg 1b \n"
-      "vzeroupper \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_u),     // %1
         "+r"(dst_v),     // %2
@@ -1897,10 +1807,14 @@ ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
         "+rm"(width)  // %3
 #endif
       : "r"((intptr_t)(src_stride_argb)),  // %4
-        "r"(&kShufARGBToUV_AVX)            // %5
+        "m"(rgbuvconstants->kRGBToU),      // %5
+        "m"(rgbuvconstants->kRGBToV),      // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6", "xmm7");
 }
+#endif  // HAS_ARGBTOUVROW_SSSE3

 #ifdef HAS_ARGBTOUV444ROW_SSSE3
@@ -1944,14 +1858,6 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
   ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width,
                           &kARGBI601UVConstants);
 }
-void ARGBToUVRow_AVX2(const uint8_t* src_argb,
-                      int src_stride_argb,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
-                         &kARGBI601UVConstants);
-}

 static const struct RgbUVConstants kABGRI601UVConstants = {
     {38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0},
@@ -1966,15 +1872,6 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
                           &kABGRI601UVConstants);
 }

-void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
-                      int src_stride_abgr,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                         &kABGRI601UVConstants);
-}
-
 static const struct RgbUVConstants kBGRAI601UVConstants = {
     {0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112},
     {0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18}};
@@ -2035,28 +1932,10 @@ void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBTOUVJ444ROW_AVX2

-void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
-                       int src_stride_argb,
-                       uint8_t* dst_u,
-                       uint8_t* dst_v,
-                       int width) {
-  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
-                         &kARGBJPEGUVConstants);
-}
-
 static const struct RgbUVConstants kABGRJPEGUVConstants = {
     {43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0},
     {-128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0}};

-void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
-                       int src_stride_abgr,
-                       uint8_t* dst_u,
-                       uint8_t* dst_v,
-                       int width) {
-  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
-                         &kABGRJPEGUVConstants);
-}
-
 #ifdef HAS_ARGBTOUVJROW_SSSE3
 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                         int src_stride_argb,
diff --git a/source/row_win.cc b/source/row_win.cc
index 1a57ee4f5..5d4aec9cf 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -184,7 +184,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
 // 32 bit
 #else  // defined(_M_X64)

-#ifdef HAS_ARGBTOUVROW_SSSE3
+// if HAS_ARGBTOUVROW_SSSE3

 // 8 bit fixed point 0.5, for bias of UV.
 static const ulvec8 kBiasUV128 = {
@@ -221,7 +221,7 @@ static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
 // JPeg full range.
 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                                15, 75, 38, 0, 15, 75, 38, 0};
-#endif
+// endif

 // vpermd for vphaddw + vpackuswb vpermd.
 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
@@ -1208,6 +1208,8 @@ __declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
   }
 }

+#ifdef HAS_ARGBTOUVROW_SSSE3
+
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
 __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
@@ -1244,6 +1246,7 @@ __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
     ret
   }
 }
+#endif

 #ifdef HAS_ARGBTOYROW_AVX2
@@ -1430,6 +1433,8 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
   }
 }

+#ifdef HAS_ARGBTOUVROW_SSSE3
+
 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                                          int src_stride_argb,
                                          uint8_t* dst_u,
@@ -1578,6 +1583,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
     ret
   }
 }
+#endif

 #ifdef HAS_ARGBTOUVROW_AVX2
 __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc
index 78a6c079a..dee8cb4c3 100644
--- a/unit_test/convert_argb_test.cc
+++ b/unit_test/convert_argb_test.cc
@@ -2721,6 +2721,86 @@ TEST_F(LibYUVConvertTest, TestUYVYToARGB) {
   EXPECT_EQ(3486643515u, checksum);
 }

+#ifdef ENABLE_ROW_TESTS
+TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
+  SIMD_ALIGNED(uint8_t orig_argb_pixels[256]);
+  SIMD_ALIGNED(uint8_t dest_u[32]);
+  SIMD_ALIGNED(uint8_t dest_v[32]);
+
+  for (int i = 0; i < 256; ++i) {
+    orig_argb_pixels[i] = i * 43;
+  }
+
+  orig_argb_pixels[0] = 0xff;  // blue
+  orig_argb_pixels[1] = 0x0;
+  orig_argb_pixels[2] = 0x0;
+  orig_argb_pixels[3] = 0xff;
+  orig_argb_pixels[4] = 0xff;  // blue
+  orig_argb_pixels[5] = 0x0;
+  orig_argb_pixels[6] = 0x0;
+  orig_argb_pixels[7] = 0xff;
+
+  orig_argb_pixels[8] = 0x0;
+  orig_argb_pixels[9] = 0xff;  // green
+  orig_argb_pixels[10] = 0x0;
+  orig_argb_pixels[11] = 0xff;
+  orig_argb_pixels[12] = 0x0;
+  orig_argb_pixels[13] = 0xff;  // green
+  orig_argb_pixels[14] = 0x0;
+  orig_argb_pixels[15] = 0xff;
+
+  orig_argb_pixels[16] = 0x0;
+  orig_argb_pixels[17] = 0x0;
+  orig_argb_pixels[18] = 0xff;  // red
+  orig_argb_pixels[19] = 0xff;
+  orig_argb_pixels[20] = 0x0;
+  orig_argb_pixels[21] = 0x0;
+  orig_argb_pixels[22] = 0xff;  // red
+  orig_argb_pixels[23] = 0xff;
+
+  orig_argb_pixels[24] = 0xff;
+  orig_argb_pixels[25] = 0xff;
+  orig_argb_pixels[26] = 0xff;  // white
+  orig_argb_pixels[27] = 0xff;
+  orig_argb_pixels[28] = 0xff;
+  orig_argb_pixels[29] = 0xff;
+  orig_argb_pixels[30] = 0xff;  // white
+  orig_argb_pixels[31] = 0xff;
+
+  int benchmark_iterations =
+      benchmark_width_ * benchmark_height_ * benchmark_iterations_ / 32;
+
+  for (int i = 0; i < benchmark_iterations; ++i) {
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+    int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+    if (has_ssse3) {
+      ARGBToUVRow_SSSE3(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+    } else {
+      ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+    }
+#elif defined(HAS_ARGBTOUVROW_NEON)
+    ARGBToUVRow_NEON(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+#else
+    ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
+#endif
+  }
+  printf("u: ");
+  for (int i = 0; i < 32; ++i) {
+    printf("%3d ", (int)dest_u[i]);
+  }
+  printf("\nv: ");
+  for (int i = 0; i < 32; ++i) {
+    printf("%3d ", (int)dest_v[i]);
+  }
+  printf("\n");
+
+  uint32_t checksum_u = HashDjb2(&dest_u[0], sizeof(dest_u), 5381);
+  EXPECT_EQ(192508756u, checksum_u);
+  uint32_t checksum_v = HashDjb2(&dest_v[0], sizeof(dest_v), 5381);
+  EXPECT_EQ(2590663990u, checksum_v);
+}
+#endif
+
 #if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)
 TEST_F(LibYUVConvertTest, TestI400LargeSize) {
   // The width and height are chosen as follows: