From 893eacf9b4497cc07ff1febddcda02db2482016a Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 13 Apr 2026 16:07:49 -0700 Subject: [PATCH] ARGBToY for AVX512 - add ARGBToYMatrixRow_AVX512BW - refactor SSE and AVX to use Matrix functions, making old functions call the new ones. Zen5 1280x720 Was AVX2 LibYUVConvertTest.ARGBToI444_Opt (1125 ms) Now AVX512 LibYUVConvertTest.ARGBToI444_Opt (641 ms) Details by Gemini: 1. Created 3 new Matrix functions: Added ARGBToYMatrixRow_SSSE3, ARGBToYMatrixRow_AVX2, and ARGBToYMatrixRow_AVX512BW to source/row_gcc.cc. These take the const struct ArgbConstants* c parameter similarly to ARGBToUV444MatrixRow_*. The x86 vector instructions dynamically calculate the needed values using the properties of the constants struct, including using vpmaddwd inside the AVX512 code to offset the lack of a native vphaddw. 2. Replaced Old Functions with Wrappers: Modified the existing implementations of ARGBToYRow_SSSE3, ARGBToYJRow_SSSE3, ABGRToYRow_SSSE3, ABGRToYJRow_SSSE3, RGBAToYRow_SSSE3, RGBAToYJRow_SSSE3, BGRAToYRow_SSSE3 (and their _AVX2 equivalents) in source/row_gcc.cc to act as inline wrappers calling the new ARGBToYMatrixRow_* functions, passing the right matrix parameters (e.g. &kArgbI601Constants, &kArgbJPEGConstants, &kAbgrI601Constants). 3. Added row_any.cc Handlers: Added ANY11MC definitions to source/row_any.cc to autogenerate ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_Any_AVX2, and ARGBToYMatrixRow_Any_AVX512BW, which safely handle non-aligned tails. 4. Updated include/libyuv/row.h: Updated the headers with the proper void declarations for all newly generated Matrix and Any_ variants. Also defined HAS_ARGBTOYROW_AVX512BW in the CPU macros. 5. Tested the Implementations: Compiled and tested on Linux x86, which resulted in all tests passing cleanly. Also successfully completed all Windows 32-bit build checks ensuring 32-bit regression prevention without issues. 
Bug: 477295731 Change-Id: I4f5eec9a961e24a9d760d0a1c0810fb5e29a0bd1 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7759494 Reviewed-by: Dale Curtis Reviewed-by: richard winterton --- README.chromium | 2 +- include/libyuv/row.h | 47 ++++ include/libyuv/version.h | 2 +- source/convert.cc | 48 ++++ source/convert_from_argb.cc | 40 ++++ source/planar_functions.cc | 8 + source/row_any.cc | 36 +++ source/row_gcc.cc | 430 ++++++++++++++++++------------------ 8 files changed, 390 insertions(+), 223 deletions(-) diff --git a/README.chromium b/README.chromium index fb002240e..e26ca693d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1924 +Version: 1925 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9c11f3199..36d4038c4 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -381,6 +381,7 @@ extern "C" { #define HAS_I422TOARGBROW_AVX512BW #define HAS_ARGBTOUV444ROW_AVX512BW #define HAS_ARGBTOUV444MATRIXROW_AVX512BW +#define HAS_ARGBTOYROW_AVX512BW #define HAS_ARGBTOUVJ444ROW_AVX512BW #endif @@ -1746,19 +1747,31 @@ void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); 
+void ABGRToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGBAToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGBAToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void BGRAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); +void BGRAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGBAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGBAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); @@ -2149,6 +2162,31 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, + 
uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGBToYMatrixRow_Any_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); + void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2204,6 +2242,15 @@ void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git 
a/include/libyuv/version.h b/include/libyuv/version.h index 039324bdd..cb8f2b4de 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1924 +#define LIBYUV_VERSION 1925 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index fbc0ea26e..90941c939 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2091,6 +2091,14 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToYRow = ARGBToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToYRow = ARGBToYRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; @@ -2522,6 +2530,14 @@ int BGRAToI420(const uint8_t* src_bgra, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + BGRAToYRow = BGRAToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + BGRAToYRow = BGRAToYRow_AVX512BW; + } + } +#endif #if defined(HAS_BGRATOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { BGRAToUVRow = BGRAToUVRow_Any_AVX2; @@ -2621,6 +2637,14 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ABGRToYRow = ABGRToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ABGRToYRow = ABGRToYRow_AVX512BW; + } + } +#endif #if defined(HAS_ABGRTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToUVRow = ABGRToUVRow_Any_AVX2; @@ -2752,6 +2776,22 @@ int RGBAToI420(const uint8_t* src_rgba, } } #endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGBAToYRow = RGBAToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBAToYRow = RGBAToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RGBAToYRow = RGBAToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + 
RGBAToYRow = RGBAToYRow_AVX512BW; + } + } +#endif #if defined(HAS_RGBATOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGBAToUVRow = RGBAToUVRow_Any_SSSE3; @@ -3125,6 +3165,14 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToYJRow = ARGBToYJRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index d3353ee79..c164693ff 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -124,6 +124,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToYRow = ARGBToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToYRow = ARGBToYRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -1083,6 +1091,14 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ABGRToYRow = ABGRToYRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ABGRToYRow = ABGRToYRow_AVX512BW; + } + } +#endif #if defined(HAS_ABGRTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToUVRow = ABGRToUVRow_Any_AVX2; @@ -2710,6 +2726,14 @@ int ARGBToJ444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToYJRow = ARGBToYJRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; @@ -3171,6 +3195,14 @@ int RGBAToJ400(const uint8_t* src_rgba, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + 
RGBAToYJRow = RGBAToYJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + RGBAToYJRow = RGBAToYJRow_AVX512BW; + } + } +#endif #if defined(HAS_RGBATOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBAToYJRow = RGBAToYJRow_Any_NEON; @@ -3268,6 +3300,14 @@ int ABGRToJ420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ABGRToYJRow = ABGRToYJRow_AVX512BW; + } + } +#endif #if defined(HAS_ABGRTOUVJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index a57e9a071..ba29620bb 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -4777,6 +4777,14 @@ static int ARGBSobelize(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToYJRow = ARGBToYJRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 98161605f..a4ba290dc 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1070,11 +1070,29 @@ ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) #ifdef HAS_ARGBTOYROW_SSSE3 ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_AVX512BW +ANY11(ARGBToYRow_Any_AVX512BW, ARGBToYRow_AVX512BW, 0, 4, 1, 63) +#endif #ifdef HAS_BGRATOYROW_SSSE3 ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_AVX512BW +ANY11(BGRAToYRow_Any_AVX512BW, BGRAToYRow_AVX512BW, 0, 4, 1, 63) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 +ANY11(BGRAToYRow_Any_AVX2, BGRAToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef 
HAS_ARGBTOYROW_AVX512BW +ANY11(RGBAToYRow_Any_AVX512BW, RGBAToYRow_AVX512BW, 0, 4, 1, 63) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 +ANY11(RGBAToYRow_Any_AVX2, RGBAToYRow_AVX2, 0, 4, 1, 31) +#endif +#ifdef HAS_ARGBTOYROW_AVX512BW +ANY11(ABGRToYRow_Any_AVX512BW, ABGRToYRow_AVX512BW, 0, 4, 1, 63) +#endif #ifdef HAS_YUY2TOYROW_SSE2 ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) @@ -1082,12 +1100,21 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_SSSE3 ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_AVX512BW +ANY11(ARGBToYJRow_Any_AVX512BW, ARGBToYJRow_AVX512BW, 0, 4, 1, 63) +#endif #ifdef HAS_ABGRTOYJROW_SSSE3 ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_AVX512BW +ANY11(ABGRToYJRow_Any_AVX512BW, ABGRToYJRow_AVX512BW, 0, 4, 1, 63) +#endif #ifdef HAS_RGBATOYJROW_SSSE3 ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBTOYROW_AVX512BW +ANY11(RGBAToYJRow_Any_AVX512BW, RGBAToYJRow_AVX512BW, 0, 4, 1, 63) +#endif #ifdef HAS_ARGBTOYROW_NEON ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) #endif @@ -2282,6 +2309,15 @@ ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ } +#ifdef HAS_ARGBTOYROW_SSSE3 +ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15) +#endif +#ifdef HAS_ARGBTOYROW_AVX2 +ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31) +#endif +#ifdef HAS_ARGBTOYROW_AVX512BW +ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63) +#endif #ifdef HAS_ARGBTOYMATRIXROW_NEON ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index dc4957a45..e1db3952b 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -24,37 +24,21 @@ extern "C" { #if 
defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, - 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u}; // JPeg full range. static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; -static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u, - 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u}; -static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, - 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) // Constants for BGRA -static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, - 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; // Constants for ABGR -static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, - 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; // Constants for RGBA. -static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, - 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; // 126 (7e) - (-109..110) = 16..235 -static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, - 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; -static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u}; @@ -1367,22 +1351,9 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
-void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" // - LABELALIGN "" // - RGBTOY(xmm7) // - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbI601Constants); } #endif // HAS_ARGBTOYROW_SSSE3 @@ -1390,198 +1361,203 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" // - - LABELALIGN "" // - RGBTOY(xmm7) // - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128), // %4 - "m"(kAddY0) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbJPEGConstants); } #endif // HAS_ARGBTOYJROW_SSSE3 #ifdef HAS_ABGRTOYJROW_SSSE3 // Convert 16 ABGR pixels (64 bytes) to 16 YJ values. // Same as ABGRToYRow but different coefficients, no add 16. 
-void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" // - - LABELALIGN "" // - RGBTOY(xmm7) // - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToYJ), // %3 - "m"(kSub128), // %4 - "m"(kAddY0) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +void ABGRToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kAbgrJPEGConstants); } #endif // HAS_ABGRTOYJROW_SSSE3 #ifdef HAS_RGBATOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. -void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" // - - LABELALIGN "" // - RGBTOY(xmm7) // - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToYJ), // %3 - "m"(kSub128), // %4 - "m"(kAddY0) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +void RGBAToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kRgbaJPEGConstants); } #endif // HAS_RGBATOYJROW_SSSE3 #if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; #endif #ifdef HAS_ARGBTOYROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
+#ifdef HAS_ARGBTOYROW_AVX2 void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqa %6,%%ymm6 \n" // - - LABELALIGN "" // - RGBTOY_AVX2(ymm7) // - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbI601Constants); } +#endif #endif // HAS_ARGBTOYROW_AVX2 #ifdef HAS_ABGRTOYROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. -void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqa %6,%%ymm6 \n" // - - LABELALIGN "" // - RGBTOY_AVX2(ymm7) // - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +#ifdef HAS_ARGBTOYROW_AVX2 +void ABGRToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kAbgrI601Constants); } +#endif #endif // HAS_ABGRTOYROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
+#ifdef HAS_ARGBTOYROW_AVX2 void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqa %6,%%ymm6 \n" // - - LABELALIGN "" // - RGBTOY_AVX2(ymm7) // - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kSub128), // %4 - "m"(kAddY0), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbJPEGConstants); } +#endif #endif // HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ABGRTOYJROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. -void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqa %6,%%ymm6 \n" // - - LABELALIGN "" // - RGBTOY_AVX2(ymm7) // - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToYJ), // %3 - "m"(kSub128), // %4 - "m"(kAddY0), // %5 - "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +#ifdef HAS_ARGBTOYROW_AVX2 +void ABGRToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kAbgrJPEGConstants); } +#endif #endif // HAS_ABGRTOYJROW_AVX2 #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
-void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqa %6,%%ymm6 \n" // +#ifdef HAS_ARGBTOYROW_AVX2 +void RGBAToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kRgbaJPEGConstants); +} +#endif +#endif // HAS_RGBATOYJROW_AVX2 - LABELALIGN "" // - RGBTOY_AVX2(ymm7) // - "vzeroupper \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToYJ), // %3 - "m"(kSub128), // %4 - "m"(kAddY0), // %5 - "m"(kPermdARGBToY_AVX) // %6 +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBTOUV444ROW_AVX2) || \ + defined(HAS_ARGBEXTRACTALPHAROW_AVX2) +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; +#endif + +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $15,%%xmm5 \n" + "packsswb %%xmm5,%%xmm5 \n" + "movdqa 0(%3),%%xmm4 \n" + "movdqa 0x60(%3),%%xmm7 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "phaddw %%xmm6,%%xmm6 \n" + "psubw %%xmm6,%%xmm7 \n" + LABELALIGN "" + RGBTOY(xmm7) + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } -#endif // HAS_RGBATOYJROW_AVX2 +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 +void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $15,%%ymm5,%%ymm5 \n" + "vpacksswb %%ymm5,%%ymm5,%%ymm5 \n" + "vbroadcastf128 0(%3),%%ymm4 \n" + "vbroadcastf128 0x60(%3),%%ymm7 \n" + "vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n" + "vphaddw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsubw 
%%ymm6,%%ymm7,%%ymm7 \n" + "vmovdqa %4,%%ymm6 \n" + LABELALIGN "" + RGBTOY_AVX2(ymm7) + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c), // %3 + "m"(kPermdARGBToY_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) +static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; +#endif + +#ifdef HAS_ARGBTOYROW_AVX512BW +void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { + asm volatile( + "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" + "vpsllw $15,%%zmm16,%%zmm5 \n" + "vpacksswb %%zmm5,%%zmm5,%%zmm5 \n" + "vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1 + "vbroadcasti64x4 0(%3),%%zmm4 \n" + "vbroadcasti64x4 0x60(%3),%%zmm7 \n" + "vpmaddubsw %%zmm5,%%zmm4,%%zmm6 \n" + "vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n" + "vpackssdw %%zmm6,%%zmm6,%%zmm6 \n" + "vpsubw %%zmm6,%%zmm7,%%zmm7 \n" + "vmovups %4,%%zmm6 \n" + LABELALIGN + "1: \n" + "vmovups (%0),%%zmm0 \n" + "vmovups 0x40(%0),%%zmm1 \n" + "vmovups 0x80(%0),%%zmm2 \n" + "vmovups 0xc0(%0),%%zmm3 \n" + "vpsubb %%zmm5,%%zmm0,%%zmm0 \n" + "vpsubb %%zmm5,%%zmm1,%%zmm1 \n" + "vpsubb %%zmm5,%%zmm2,%%zmm2 \n" + "vpsubb %%zmm5,%%zmm3,%%zmm3 \n" + "vpmaddubsw %%zmm0,%%zmm4,%%zmm0 \n" + "vpmaddubsw %%zmm1,%%zmm4,%%zmm1 \n" + "vpmaddubsw %%zmm2,%%zmm4,%%zmm2 \n" + "vpmaddubsw %%zmm3,%%zmm4,%%zmm3 \n" + "lea 0x100(%0),%0 \n" + "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" + "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" + "vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" + "vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n" + "vpmaddwd %%zmm16,%%zmm3,%%zmm3 \n" + "vpackssdw %%zmm3,%%zmm2,%%zmm2 \n" + "vpaddw %%zmm7,%%zmm0,%%zmm0 \n" + "vpaddw %%zmm7,%%zmm2,%%zmm2 \n" + "vpsrlw $0x8,%%zmm0,%%zmm0 \n" + "vpsrlw $0x8,%%zmm2,%%zmm2 \n" + "vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" + "vpermd 
%%zmm0,%%zmm6,%%zmm0 \n" + "vmovups %%zmm0,(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c), // %3 + "m"(kPermdARGBToY_AVX512BW) // %4 + : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", + "zmm7", "zmm16"); +} +#endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 - void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1724,8 +1700,6 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, #endif // HAS_ARGBTOUV444ROW_AVX2 #ifdef HAS_ARGBTOUV444ROW_AVX512BW -static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_u, @@ -1977,6 +1951,62 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_SSSE3 + +#ifdef HAS_ARGBTOYROW_AVX2 +void RGBAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kRgbaI601Constants); +} +#endif + +#ifdef HAS_ARGBTOYROW_AVX2 +void BGRAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kBgraI601Constants); +} +#endif + + +#ifdef HAS_ARGBTOYROW_AVX512BW +void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kArgbI601Constants); +} +#endif + +#ifdef HAS_ARGBTOYROW_AVX512BW +void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kArgbJPEGConstants); +} +#endif + +#ifdef HAS_ARGBTOYROW_AVX512BW +void ABGRToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kAbgrI601Constants); +} +#endif + +#ifdef HAS_ARGBTOYROW_AVX512BW +void ABGRToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, 
int width) { + ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kAbgrJPEGConstants); +} +#endif + +#ifdef HAS_ARGBTOYROW_AVX512BW +void RGBAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kRgbaI601Constants); +} +#endif + +#ifdef HAS_ARGBTOYROW_AVX512BW +void RGBAToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kRgbaJPEGConstants); +} +#endif + +#ifdef HAS_ARGBTOYROW_AVX512BW +void BGRAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kBgraI601Constants); +} +#endif + #ifdef HAS_ARGBTOUV444ROW_AVX2 void ARGBToUV444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_u, @@ -2127,58 +2157,16 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, } #endif // HAS_ABGRTOUVJROW_AVX2 -void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN "" // - RGBTOY(xmm7) - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +void BGRAToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kBgraI601Constants); } -void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN "" // - RGBTOY(xmm7) - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +void ABGRToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + 
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kAbgrI601Constants); } -void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" - - LABELALIGN "" // - RGBTOY(xmm7) - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kSub128), // %4 - "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); +void RGBAToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kRgbaI601Constants); } #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)