diff --git a/include/libyuv/row.h b/include/libyuv/row.h index fac23d211..426fbb951 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -280,8 +280,12 @@ extern "C" { #define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ABGRTOUVJROW_AVX2 #define HAS_ABGRTOUVROW_AVX2 +#if defined(__x86_64__) #define HAS_ABGRTOYJROW_AVX2 #define HAS_ABGRTOYROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 +#endif #define HAS_AR64TOARGBROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBTOAB64ROW_AVX2 @@ -295,8 +298,9 @@ extern "C" { #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOUVMATRIXROW_AVX2 #define HAS_ARGBTOUV444MATRIXROW_AVX2 -#define HAS_ARGBTOYJROW_AVX2 -#define HAS_ARGBTOYROW_AVX2 +#if defined(__x86_64__) +#define HAS_RGBTOYMATRIXROW_AVX2 +#endif #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 @@ -330,9 +334,13 @@ extern "C" { #define HAS_P210TOARGBROW_AVX2 #define HAS_P410TOAR30ROW_AVX2 #define HAS_P410TOARGBROW_AVX2 +#if defined(__x86_64__) #define HAS_RAWTOYJROW_AVX2 +#define HAS_RAWTOYROW_AVX2 #define HAS_RGB24TOYJROW_AVX2 +#define HAS_RGB24TOYROW_AVX2 #define HAS_RGBATOYJROW_AVX2 +#endif #define HAS_SPLITARGBROW_AVX2 #define HAS_SPLITRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 @@ -354,6 +360,7 @@ extern "C" { defined(_M_X64) || defined(_M_X86)) && \ ((defined(_MSC_VER) && !defined(__clang__)) || \ defined(LIBYUV_ENABLE_ROWWIN)) +#if defined(__x86_64__) || defined(_M_X64) #define HAS_ARGBTOYROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 @@ -362,6 +369,7 @@ extern "C" { #define HAS_RGBATOYROW_AVX2 #define HAS_BGRATOYROW_AVX2 #endif +#endif // The following are available for AVX512 clang x86 platforms: // TODO(fbarchard): Port to GCC and Visual C @@ -1857,6 +1865,10 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c); +void RGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb, uint8_t* 
dst_u, uint8_t* dst_v, @@ -1866,6 +1878,10 @@ void ARGBToYMatrixRow_Any_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c); +void RGBToYMatrixRow_Any_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); #endif void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, @@ -2220,6 +2236,12 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c); +#if defined(HAS_RGBTOYMATRIXROW_AVX2) +void RGBToYMatrixRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +#endif void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -2232,6 +2254,12 @@ void ARGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c); +#if defined(HAS_RGBTOYMATRIXROW_AVX2) +void RGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +#endif void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width, diff --git a/source/convert.cc b/source/convert.cc index 79c1e16d6..4a13972be 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2925,6 +2925,7 @@ int RGBAToI420(const uint8_t* src_rgba, // Enabled if 1 pass is available #if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_LSX) || \ + defined(HAS_RGB24TOYROW_AVX2) || \ defined(HAS_RGB24TOYROW_RVV)) #define HAS_RGB24TOYROW #endif @@ -2969,6 +2970,16 @@ int RGB24ToI420(const uint8_t* src_rgb24, #if defined(HAS_RGB24TOYROW) +#if defined(HAS_RGB24TOYROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // TODO(fbarchard): Write an AVX2 function for RGB24ToUVRow. + RGB24ToYRow = RGB24ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToYRow = RGB24ToYRow_AVX2; + } + } +#endif + // Neon version does direct RGB24 to YUV. 
#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { @@ -3112,7 +3123,7 @@ int RGB24ToI420(const uint8_t* src_rgb24, #undef HAS_RGB24TOYROW // Enabled if 1 pass is available -#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_RVV) +#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_AVX2) || defined(HAS_RGB24TOYJROW_RVV) #define HAS_RGB24TOYJROW #endif @@ -3156,6 +3167,15 @@ int RGB24ToJ420(const uint8_t* src_rgb24, #if defined(HAS_RGB24TOYJROW) +#if defined(HAS_RGB24TOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToYJRow = RGB24ToYJRow_AVX2; + } + } +#endif + // Neon version does direct RGB24 to YUV. #if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { @@ -3288,6 +3308,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Enabled if 1 pass is available #if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_LSX) || \ + defined(HAS_RAWTOYROW_AVX2) || \ defined(HAS_RAWTOYROW_RVV)) #define HAS_RAWTOYROW #endif @@ -3331,6 +3352,16 @@ int RAWToI420(const uint8_t* src_raw, #if defined(HAS_RAWTOYROW) +#if defined(HAS_RAWTOYROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // TODO(fbarchard): Write an AVX2 function for RAWToUVRow. + RAWToYRow = RAWToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToYRow = RAWToYRow_AVX2; + } + } +#endif + // Neon version does direct RAW to YUV. 
#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { @@ -3482,7 +3513,7 @@ int RAWToI420(const uint8_t* src_raw, #undef HAS_RAWTOYROW // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_AVX2) || defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -3526,6 +3557,15 @@ int RAWToJ420(const uint8_t* src_raw, #if defined(HAS_RAWTOYJROW) +#if defined(HAS_RAWTOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RAWToYJRow = RAWToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_AVX2; + } + } +#endif + // Neon version does direct RAW to YUV. #if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { @@ -4717,7 +4757,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24TOYJROW_AVX2) +#if defined(HAS_RGB24TOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -4798,7 +4838,7 @@ int RAWToJ400(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOYJROW_AVX2) +#if defined(HAS_RAWTOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RAWToYJRow = RAWToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 9428f1439..c72f77e93 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -3979,7 +3979,7 @@ int ARGBToAB64(const uint8_t* src_argb, } // Enabled if 1 pass is available -#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV) +#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_AVX2) || defined(HAS_RAWTOYJROW_RVV) #define HAS_RAWTOYJROW #endif @@ -4024,6 +4024,16 @@ int RAWToJNV21(const uint8_t* src_raw, #if defined(HAS_RAWTOYJROW) +#if defined(HAS_RAWTOYJROW_AVX2) && 
defined(HAS_RGBTOYMATRIXROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + // TODO(fbarchard): Write an AVX2 function for RAWToUVJRow. + RAWToYJRow = RAWToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToYJRow = RAWToYJRow_AVX2; + } + } +#endif + // Neon version does direct RAW to YUV. #if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { diff --git a/source/row_any.cc b/source/row_any.cc index 8ac48d3c0..fdef98599 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1202,6 +1202,9 @@ ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31) #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15) #endif +#ifdef HAS_RGB24TOYROW_AVX2 +ANY11(RGB24ToYRow_Any_AVX2, RGB24ToYRow_AVX2, 0, 3, 1, 31) +#endif #ifdef HAS_RGB24TOYJROW_AVX2 ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31) #endif @@ -1226,6 +1229,9 @@ ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31) #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15) #endif +#ifdef HAS_RAWTOYROW_AVX2 +ANY11(RAWToYRow_Any_AVX2, RAWToYRow_AVX2, 0, 3, 1, 31) +#endif #ifdef HAS_RAWTOYJROW_AVX2 ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31) #endif @@ -2315,8 +2321,11 @@ ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) #ifdef HAS_ARGBTOYROW_SSSE3 ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15) #endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31) #endif +#ifdef HAS_RGBTOYMATRIXROW_AVX2 +ANY11MC(RGBToYMatrixRow_Any_AVX2, RGBToYMatrixRow_AVX2, 3, 31) +#endif #ifdef HAS_ARGBTOYROW_AVX512BW ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63) diff --git a/source/row_common.cc b/source/row_common.cc index 8b192a539..3fac1d5f4 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -4377,42 +4377,6 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, } #endif -#ifdef HAS_RGB24TOYJROW_AVX2 -// Convert 
16 RGB24 pixels (64 bytes) to 16 YJ values. -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); - ARGBToYJRow_AVX2(row, dst_yj, twidth); - src_rgb24 += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RGB24TOYJROW_AVX2 - -#ifdef HAS_RAWTOYJROW_AVX2 -// Convert 32 RAW pixels (128 bytes) to 32 YJ values. -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; -#ifdef HAS_RAWTOARGBROW_AVX2 - RAWToARGBRow_AVX2(src_raw, row, twidth); -#else - RAWToARGBRow_SSSE3(src_raw, row, twidth); -#endif - ARGBToYJRow_AVX2(row, dst_yj, twidth); - src_raw += twidth * 3; - dst_yj += twidth; - width -= twidth; - } -} -#endif // HAS_RAWTOYJROW_AVX2 - #ifdef HAS_RGB24TOYJROW_SSSE3 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. 
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 9ed7fce9c..a94dbcf26 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1347,6 +1347,46 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "sub $0x20,%2 \n" \ "jg 1b \n" +#if defined(__x86_64__) +#define RGB24ToY_AVX2(round) \ + "1: \n" \ + "vmovdqu (%0),%%ymm0 \n" \ + "vmovdqu 0x20(%0),%%ymm1 \n" \ + "vmovdqu 0x40(%0),%%ymm2 \n" \ + "vperm2i128 $0x21,%%ymm1,%%ymm0,%%ymm3 \n" \ + "vperm2i128 $0x21,%%ymm2,%%ymm1,%%ymm11 \n" \ + "vpermd %%ymm0,%%ymm8,%%ymm0 \n" \ + "vpermd %%ymm3,%%ymm9,%%ymm1 \n" \ + "vpermd %%ymm2,%%ymm9,%%ymm3 \n" \ + "vpermd %%ymm11,%%ymm8,%%ymm2 \n" \ + "vpshufb %%ymm10,%%ymm0,%%ymm0 \n" \ + "vpshufb %%ymm10,%%ymm1,%%ymm1 \n" \ + "vpshufb %%ymm10,%%ymm2,%%ymm2 \n" \ + "vpshufb %%ymm10,%%ymm3,%%ymm3 \n" \ + "vpsubb %%ymm5,%%ymm0,%%ymm0 \n" \ + "vpsubb %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpsubb %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpsubb %%ymm5,%%ymm3,%%ymm3 \n" \ + "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ + "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ + "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ + "lea 0x60(%0),%0 \n" \ + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ + "prefetcht0 1280(%0) \n" \ + "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" \ + "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ + "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" \ + "vmovdqu %%ymm0,(%1) \n" \ + "lea 0x20(%1),%1 \n" \ + "sub $0x20,%2 \n" \ + "jg 1b \n" +#endif + // clang-format on #ifdef HAS_ARGBTOYROW_SSSE3 @@ -1433,10 +1473,60 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { #endif #endif // HAS_RGBATOYJROW_AVX2 +void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); + +void RGBToYMatrixRow_AVX2(const 
uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); + +#ifdef HAS_ARGBTOYROW_AVX2 +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_AVX2(src_rgb24, dst_yj, width, &kArgbJPEGConstants); +} + +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_AVX2(src_raw, dst_yj, width, &kAbgrJPEGConstants); +} + +void RGB24ToYRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_AVX2(src_rgb24, dst_y, width, &kArgbI601Constants); +} + +void RAWToYRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_AVX2(src_raw, dst_y, width, &kAbgrI601Constants); +} +#endif + +#if defined(HAS_ARGBTOYROW_NEON) +void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants); +} + +void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants); +} +#endif + #if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBTOUV444ROW_AVX2) || \ defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. 
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; +#if defined(__x86_64__) +static const lvec32 kPermdRGB24_0_AVX2 = {0, 1, 2, 0, 3, 4, 5, 0}; +static const lvec32 kPermdRGB24_1_AVX2 = {2, 3, 4, 0, 5, 6, 7, 0}; +#endif #endif #ifdef HAS_ARGBTOYROW_SSSE3 @@ -1491,6 +1581,40 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } + +#if defined(__x86_64__) +void RGBToYMatrixRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $15,%%ymm5,%%ymm5 \n" + "vpacksswb %%ymm5,%%ymm5,%%ymm5 \n" + "vbroadcastf128 0(%3),%%ymm4 \n" + "vbroadcastf128 0x60(%3),%%ymm7 \n" + "vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n" + "vphaddw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsubw %%ymm6,%%ymm7,%%ymm7 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm8 \n" + "vmovdqa %6,%%ymm9 \n" + "vbroadcastf128 %7,%%ymm10 \n" + LABELALIGN "" + RGB24ToY_AVX2(ymm7) + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c), // %3 + "m"(kPermdARGBToY_AVX), // %4 + "m"(kPermdRGB24_0_AVX2),// %5 + "m"(kPermdRGB24_1_AVX2),// %6 + "m"(kShuffleMaskRGB24ToARGB) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11"); +} +#endif #endif #if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW) diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 9560c1123..111c5916a 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -1706,6 +1706,78 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { } #endif // HAS_ARGBTOAR30ROW_AVX2 +#ifdef HAS_RGBTOYMATRIXROW_AVX2 +TEST_F(LibYUVConvertTest, RGBToYMatrixRow_Opt) { + const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31; + align_buffer_page_end(src, kPixels 
* 3); + align_buffer_page_end(dst_opt, kPixels); + align_buffer_page_end(dst_c, kPixels); + MemRandomize(src, kPixels * 3); + memset(dst_opt, 0, kPixels); + memset(dst_c, 1, kPixels); + + // Use kArgbI601Constants as a representative conversion matrix. + // RGBToYMatrixRow_AVX2 reads packed 24-bit pixels directly, but the + // C reference row functions only accept 4-byte ARGB input. + // There is no RGBToYMatrixRow_C, so build the reference by expanding + // the RGB24 source to ARGB with RGB24ToARGB and then running + // ARGBToYMatrixRow_C on the expanded buffer. + align_buffer_page_end(src_argb, kPixels * 4); + RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 1); + ARGBToYMatrixRow_C(src_argb, dst_c, kPixels, &kArgbI601Constants); + + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + for (int i = 0; i < benchmark_iterations_; ++i) { + if (has_avx2) { + RGBToYMatrixRow_AVX2(src, dst_opt, kPixels, &kArgbI601Constants); + } else { + ARGBToYMatrixRow_C(src_argb, dst_opt, kPixels, &kArgbI601Constants); + } + } + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_opt[i], dst_c[i]); + } + + free_aligned_buffer_page_end(src_argb); + free_aligned_buffer_page_end(src); + free_aligned_buffer_page_end(dst_opt); + free_aligned_buffer_page_end(dst_c); +} + +TEST_F(LibYUVConvertTest, RGBToYMatrixRow_2Step_Opt) { + const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31; + align_buffer_page_end(src, kPixels * 3); + align_buffer_page_end(src_argb, kPixels * 4); + align_buffer_page_end(dst_opt, kPixels); + align_buffer_page_end(dst_c, kPixels); + MemRandomize(src, kPixels * 3); + memset(dst_opt, 0, kPixels); + memset(dst_c, 1, kPixels); + + RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 1); + ARGBToYMatrixRow_C(src_argb, dst_c, kPixels, &kArgbI601Constants); + + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + for (int i = 0; i < benchmark_iterations_; ++i) { + if (has_avx2) { + RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 
1); + ARGBToYMatrixRow_AVX2(src_argb, dst_opt, kPixels, &kArgbI601Constants); + } else { + RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 1); + ARGBToYMatrixRow_C(src_argb, dst_opt, kPixels, &kArgbI601Constants); + } + } + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_opt[i], dst_c[i]); + } + + free_aligned_buffer_page_end(src_argb); + free_aligned_buffer_page_end(src); + free_aligned_buffer_page_end(dst_opt); + free_aligned_buffer_page_end(dst_c); +} +#endif // HAS_RGBTOYMATRIXROW_AVX2 + #ifdef HAS_ABGRTOAR30ROW_AVX2 TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.