From 61354d2671d9b5c73cc964415fe25bc76cea051a Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Mon, 10 Feb 2025 18:19:58 -0800
Subject: [PATCH] ARGBToUV Matrix for AVX2 and SSSE3

- Round before shifting to 8 bit to match NEON
- RAWToARGB use unaligned loads and port to AVX2

Was C/SSSE3/AVX2
ARGBToI444_Opt (343 ms)
ARGBToJ444_Opt (677 ms)
RAWToI444_Opt (405 ms)
RAWToJ444_Opt (803 ms)

Now AVX2
ARGBToI444_Opt (283 ms)
ARGBToJ444_Opt (284 ms)
RAWToI444_Opt (316 ms)
RAWToJ444_Opt (339 ms)

Profile Now AVX2
38.31% ARGBToUVJ444Row_AVX2
32.31% RAWToARGBRow_AVX2
23.99% ARGBToYJRow_AVX2

Profile Was C/SSSE3/AVX2
73.15% ARGBToUVJ444Row_C
15.74% RAWToARGBRow_SSSE3
 8.87% ARGBToYJRow_AVX2

Bug: 381138208
Change-Id: I696b2d83435bc985aa38df831e01ff1a658da56e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6231592
Reviewed-by: Wan-Teh Chang
Reviewed-by: Ben Weiss
Reviewed-by: richard winterton
Commit-Queue: Frank Barchard
---
 README.chromium             |   2 +-
 include/libyuv/row.h        |  39 +-
 include/libyuv/version.h    |   2 +-
 source/convert.cc           |  49 ++-
 source/convert_argb.cc      |   8 +
 source/convert_from_argb.cc |  24 +
 source/row_any.cc           |  12 +
 source/row_common.cc        |  43 +-
 source/row_gcc.cc           | 855 +++++++++++++++++-------------
 9 files changed, 557 insertions(+), 477 deletions(-)

diff --git a/README.chromium b/README.chromium
index 017031390..594e3db39 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1904
+Version: 1905
 License: BSD
 License File: LICENSE
 Shipped: yes

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 9ee8af68f..74372e1db 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -88,7 +88,6 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 #if !defined(LIBYUV_BIT_EXACT)
 #define HAS_ABGRTOUVROW_SSSE3
-#define HAS_ARGBTOUV444ROW_SSSE3
 #define HAS_ARGBTOUVJROW_SSSE3
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_BGRATOUVROW_SSSE3
@@ -234,6 +233,8 @@ extern "C" {
 #define HAS_ARGBTOAB64ROW_SSSE3
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_ARGBTOAR64ROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJ444ROW_SSSE3
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
@@ -259,13 +260,14 @@ extern "C" {
 #define HAS_P210TOARGBROW_SSSE3
 #define HAS_P410TOAR30ROW_SSSE3
 #define HAS_P410TOARGBROW_SSSE3
+#define HAS_RAWTOARGBROW_AVX2
 #define HAS_RAWTORGBAROW_SSSE3
 #define HAS_RGB24MIRRORROW_SSSE3
 #define HAS_RGBATOYJROW_SSSE3
 #define HAS_SPLITARGBROW_SSE2
 #define HAS_SPLITARGBROW_SSSE3
-#define HAS_SPLITRGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSE41
+#define HAS_SPLITRGBROW_SSSE3
 #define HAS_SPLITXRGBROW_SSE2
 #define HAS_SPLITXRGBROW_SSSE3
 #define HAS_SWAPUVROW_SSSE3
@@ -298,6 +300,8 @@ extern "C" {
 #define HAS_ARGBTOAR64ROW_AVX2
 #define HAS_ARGBTORAWROW_AVX2
 #define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_ARGBTOUV444ROW_AVX2
+#define HAS_ARGBTOUVJ444ROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
@@ -332,8 +336,8 @@ extern "C" {
 #define HAS_P410TOAR30ROW_AVX2
 #define HAS_P410TOARGBROW_AVX2
 #define HAS_RGBATOYJROW_AVX2
-#define HAS_SPLITRGBROW_AVX2
 #define HAS_SPLITARGBROW_AVX2
+#define HAS_SPLITRGBROW_AVX2
 #define HAS_SPLITUVROW_16_AVX2
 #define HAS_SPLITXRGBROW_AVX2
 #define HAS_SWAPUVROW_AVX2
@@ -2699,6 +2703,33 @@ void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr,
                               uint8_t* dst_v,
                               int width);
+void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+ int width); +void ARGBToUVJ444Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJ444Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + void ARGBToUV444Row_C(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -3853,6 +3884,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); @@ -3955,6 +3987,7 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e26e427d0..f840f057b 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1904 +#define LIBYUV_VERSION 1905 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 6c37143a9..f48df28f2 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -3265,6 +3265,14 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -3443,6 +3451,14 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -3559,6 +3575,14 @@ int RAWToI444(const uint8_t* src_raw, } } #endif +#if defined(HAS_ARGBTOUV444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUV444Row = ARGBToUV444Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUV444Row = ARGBToUV444Row_AVX2; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV444Row = ARGBToUV444Row_Any_NEON; @@ -3669,6 +3693,14 @@ int RAWToI444(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_AVX2; + } + } +#endif #if defined(HAS_RAWTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; @@ -3762,7 +3794,6 @@ int RAWToJ444(const uint8_t* src_raw, src_stride_raw = -src_stride_raw; } // TODO: add row coalesce when main loop handles large 
width in blocks - // TODO: implement UVJ444 or trim the ifdef below #if defined(HAS_ARGBTOUVJ444ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJ444Row = ARGBToUVJ444Row_Any_SSSE3; @@ -3771,6 +3802,14 @@ int RAWToJ444(const uint8_t* src_raw, } } #endif +#if defined(HAS_ARGBTOUVJ444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_AVX2; + } + } +#endif #if defined(HAS_ARGBTOUVJ444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON; @@ -3881,6 +3920,14 @@ int RAWToJ444(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_AVX2; + } + } +#endif #if defined(HAS_RAWTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; diff --git a/source/convert_argb.cc b/source/convert_argb.cc index d341fa20d..41997fe3b 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3833,6 +3833,14 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_AVX2; + } + } +#endif #if defined(HAS_RAWTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index d326641e7..8d2e8d05e 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -60,6 +60,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUV444Row = ARGBToUV444Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUV444Row = ARGBToUV444Row_AVX2; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUV444Row = ARGBToUV444Row_Any_NEON; @@ -2445,6 +2453,14 @@ int ARGBToJ444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJ444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJ444Row = ARGBToUVJ444Row_AVX2; + } + } +#endif #if defined(HAS_ARGBTOUVJ444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON; @@ -3630,6 +3646,14 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; diff --git a/source/row_any.cc b/source/row_any.cc index c49ef50bd..85fb6ffb5 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1033,6 +1033,9 @@ ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) #endif +#if defined(HAS_RAWTOARGBROW_AVX2) +ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31) +#endif #if defined(HAS_RAWTORGBAROW_SSSE3) ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) #endif @@ -2145,6 +2148,15 @@ ANY12(SplitUVRow_Any_LSX, SplitUVRow_LSX, 0, 2, 0, 31) #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) 
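// ANY12 (defined earlier in row_any.cc) wraps a two-plane row function:
// its arguments are roughly (any-width name, SIMD row, UV shift, source
// bytes per pixel, dest shift, width mask). The SIMD row is run on
// width & ~mask pixels and the ragged tail goes through a padded temp
// buffer, so the SIMD rows only ever see multiples of mask + 1 pixels.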
#endif +#ifdef HAS_ARGBTOUVJ444ROW_SSSE3 +ANY12(ARGBToUVJ444Row_Any_SSSE3, ARGBToUVJ444Row_SSSE3, 0, 4, 0, 15) +#endif +#ifdef HAS_ARGBTOUV444ROW_AVX2 +ANY12(ARGBToUV444Row_Any_AVX2, ARGBToUV444Row_AVX2, 0, 4, 0, 31) +#endif +#ifdef HAS_ARGBTOUVJ444ROW_AVX2 +ANY12(ARGBToUVJ444Row_Any_AVX2, ARGBToUVJ444Row_AVX2, 0, 4, 0, 31) +#endif #ifdef HAS_YUY2TOUV422ROW_AVX2 ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) diff --git a/source/row_common.cc b/source/row_common.cc index 5e1551b99..36561e0b7 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -47,7 +47,6 @@ extern "C" { #if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ defined(__i386__) || defined(_M_IX86)) #define LIBYUV_ARGBTOUV_PAVGB 1 -#define LIBYUV_RGBTOU_TRUNCATE 1 #endif #if defined(LIBYUV_BIT_EXACT) #define LIBYUV_UNATTENUATE_DUP 1 @@ -626,10 +625,16 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64, } #ifdef LIBYUV_RGB7 -// Old 7 bit math for compatibility on unsupported platforms. +// Old 7 bit math for Visual C static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16); } +static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); +} +static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); +} #else // 8 bit // Intel SSE/AVX uses the following equivalent formula @@ -640,20 +645,6 @@ static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); } -#endif - -#define AVGB(a, b) (((a) + (b) + 1) >> 1) - -// LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. -#ifdef LIBYUV_RGBTOU_TRUNCATE -static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); -} -static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); -} -#else -// TODO(fbarchard): Add rounding to x86 SIMD and use this static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); } @@ -662,7 +653,9 @@ static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { } #endif -// ARM uses uint16 + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) +// ARM uses uint16. TODO: Make ARM use uint8 to allow dotproduct. #if !defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGBxToU(uint16_t r, uint16_t g, uint16_t b) { return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); @@ -784,6 +777,16 @@ MAKEROWY(RAW, 0, 1, 2, 3) // b -0.08131 * 255 = -20.73405 = -20 // g -0.41869 * 255 = -106.76595 = -107 // r 0.50000 * 255 = 127.5 = 127 +// TODO: consider 256 for fixed point on UV +// JPeg 8 bit U: +// b 0.50000 * 256 = 128.0 = 128 +// g -0.33126 * 256 = −84.80256 = -85 +// r -0.16874 * 256 = −43.19744 = -43 +// JPeg 8 bit V: +// b -0.08131 * 256 = −20.81536 = -21 +// g -0.41869 * 256 = −107.18464 = -107 +// r 0.50000 * 256 = 128.0 = 128 + #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. 
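The rounding change named in the commit message is easiest to see in scalar form. The old x86 UV rows did "psraw $8", "packsswb", then "paddb 0x80" (truncate, then recenter), while the new rows add kAddUV128 before an unsigned "psrlw $8" and "packuswb", matching NEON and the rounded C reference kept above. A minimal sketch of the two pipelines, assuming kAddUV128 holds 0x8080 per 16-bit lane (the 0x8000 recentering bias plus 0x80 of rounding); the helper names here are illustrative, not libyuv functions:

#include <stdint.h>

static inline uint8_t RGBToU_Truncate(uint8_t r, uint8_t g, uint8_t b) {
  // Old x86 behavior: arithmetic shift right by 8, then +128 recenter,
  // which is equivalent to adding only the 0x8000 bias before the shift.
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8000) >> 8);
}

static inline uint8_t RGBToU_Round(uint8_t r, uint8_t g, uint8_t b) {
  // New x86 behavior: paddw kAddUV128, psrlw $8, packuswb. The extra
  // 0x80 rounds the fractional byte instead of truncating it.
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}

For r = g = 0, b = 2 the truncating form gives 128 while the rounding form gives 129: the kind of one-code-point difference against NEON, which rounds, that this change removes.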
@@ -4379,13 +4382,17 @@ void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
 #endif  // HAS_RGB24TOYJROW_AVX2
 
 #ifdef HAS_RAWTOYJROW_AVX2
-// Convert 16 RAW pixels (48 bytes) to 16 YJ values.
+// Convert 32 RAW pixels (96 bytes) to 32 YJ values.
 void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
   // Row buffer for intermediate ARGB pixels.
   SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+#ifdef HAS_RAWTOARGBROW_AVX2
+    RAWToARGBRow_AVX2(src_raw, row, twidth);
+#else
     RAWToARGBRow_SSSE3(src_raw, row, twidth);
+#endif
     ARGBToYJRow_AVX2(row, dst_yj, twidth);
     src_raw += twidth * 3;
     dst_yj += twidth;

diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 81f35f861..b8e0b4d3e 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -58,12 +58,6 @@ static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0,
 static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                                0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
 
-static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
-                              0, -38, -74, 112, 0, -38, -74, 112};
-
-static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
-                              0, 112, -94, -18, 0, 112, -94, -18};
-
 // Constants for ABGR
 static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                                66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
@@ -78,12 +72,6 @@ static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
 static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                                0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
 
-static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
-                              0, 112, -74, -38, 0, 112, -74, -38};
-
-static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
-                              0, -18, -94, 112, 0, -18, -94, 112};
-
 static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                                0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
 
@@ -102,12 +90,16 @@ static const uvec8 kShuffleMaskRGB24ToARGB = {
     0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
-                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
+static const uvec8 kShuffleMaskRAWToARGB = {
+    2u, 1u, 0u, 128u, 5u, 4u, 3u, 128u, 8u, 7u, 6u, 128u, 11u, 10u, 9u, 128u};
+// Shuffle table for converting RAW to ARGB. Last 12
+static const uvec8 kShuffleMaskRAWToARGB_0 = {6u,  5u,  4u,  128u, 9u,  8u,
+                                              7u,  128u, 12u, 11u, 10u, 128u,
+                                              15u, 14u, 13u, 128u};
 
 // Shuffle table for converting RAW to RGBA.
-static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
-                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};
+static const uvec8 kShuffleMaskRAWToRGBA = {
+    128u, 2u, 1u, 0u, 128u, 5u, 4u, 3u, 128u, 8u, 7u, 6u, 128u, 11u, 10u, 9u};
 
 // Shuffle table for converting RAW to RGB24. First 8.
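// (In the RAW to ARGB/RGBA masks above, index 128 sets the pshufb sign
// bit, which zeroes that output byte; the row functions below then fill
// the cleared alpha lane with 0xff via por. That is what allows plain
// overlapping unaligned loads of 12 useful bytes each in the new code,
// in place of the old palignr realignment.)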
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
@@ -231,40 +223,81 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   asm volatile(
-      "pcmpeqb     %%xmm5,%%xmm5 \n"  // 0xff000000
-      "pslld       $0x18,%%xmm5 \n"
+      "pcmpeqb     %%xmm6,%%xmm6 \n"  // 0xff000000
+      "pslld       $0x18,%%xmm6 \n"
       "movdqa      %3,%%xmm4 \n"
+      "movdqa      %4,%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
       "movdqu      (%0),%%xmm0 \n"
-      "movdqu      0x10(%0),%%xmm1 \n"
-      "movdqu      0x20(%0),%%xmm3 \n"
+      "movdqu      12(%0),%%xmm1 \n"
+      "movdqu      24(%0),%%xmm2 \n"
+      "movdqu      32(%0),%%xmm3 \n"
       "lea         0x30(%0),%0 \n"
-      "movdqa      %%xmm3,%%xmm2 \n"
-      "palignr     $0x8,%%xmm1,%%xmm2 \n"
-      "pshufb      %%xmm4,%%xmm2 \n"
-      "por         %%xmm5,%%xmm2 \n"
-      "palignr     $0xc,%%xmm0,%%xmm1 \n"
       "pshufb      %%xmm4,%%xmm0 \n"
-      "movdqu      %%xmm2,0x20(%1) \n"
-      "por         %%xmm5,%%xmm0 \n"
       "pshufb      %%xmm4,%%xmm1 \n"
-      "movdqu      %%xmm0,(%1) \n"
-      "por         %%xmm5,%%xmm1 \n"
-      "palignr     $0x4,%%xmm3,%%xmm3 \n"
-      "pshufb      %%xmm4,%%xmm3 \n"
+      "pshufb      %%xmm4,%%xmm2 \n"
+      "pshufb      %%xmm5,%%xmm3 \n"
+      "por         %%xmm6,%%xmm0 \n"
+      "por         %%xmm6,%%xmm1 \n"
+      "por         %%xmm6,%%xmm2 \n"
+      "por         %%xmm6,%%xmm3 \n"
+      "movdqu      %%xmm0,0x00(%1) \n"
       "movdqu      %%xmm1,0x10(%1) \n"
-      "por         %%xmm5,%%xmm3 \n"
+      "movdqu      %%xmm2,0x20(%1) \n"
       "movdqu      %%xmm3,0x30(%1) \n"
       "lea         0x40(%1),%1 \n"
       "sub         $0x10,%2 \n"
       "jg          1b \n"
-      : "+r"(src_raw),              // %0
-        "+r"(dst_argb),             // %1
-        "+r"(width)                 // %2
-      : "m"(kShuffleMaskRAWToARGB)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+      : "+r"(src_raw),                // %0
+        "+r"(dst_argb),               // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskRAWToARGB),   // %3
+        "m"(kShuffleMaskRAWToARGB_0)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vpcmpeqb    %%ymm6,%%ymm6,%%ymm6 \n"  // 0xff000000
+      "vpslld      $0x18,%%ymm6,%%ymm6 \n"
+      "vbroadcastf128 %3,%%ymm4 \n"  //
+      "vbroadcastf128 %4,%%ymm5 \n"  //
+
+      LABELALIGN  //
+      "1: \n"
+      "vmovdqu     (%0),%%xmm0 \n"              // first 12
+      "vinserti128 $1,12(%0),%%ymm0,%%ymm0 \n"  // second 12
+      "vmovdqu     24(%0),%%xmm1 \n"            // third 12
+      "vinserti128 $1,36(%0),%%ymm1,%%ymm1 \n"  // fourth 12
+      "vmovdqu     48(%0),%%xmm2 \n"            // fifth 12
+      "vinserti128 $1,60(%0),%%ymm2,%%ymm2 \n"  // sixth 12
+      "vmovdqu     68(%0),%%xmm3 \n"            // seventh 12
+      "vinserti128 $1,80(%0),%%ymm3,%%ymm3 \n"  // eighth 12
+      "lea         96(%0),%0 \n"
+      "vpshufb     %%ymm4,%%ymm0,%%ymm0 \n"
+      "vpshufb     %%ymm4,%%ymm1,%%ymm1 \n"
+      "vpshufb     %%ymm4,%%ymm2,%%ymm2 \n"
+      "vpshufb     %%ymm5,%%ymm3,%%ymm3 \n"
+      "vpor        %%ymm6,%%ymm0,%%ymm0 \n"
+      "vpor        %%ymm6,%%ymm1,%%ymm1 \n"
+      "vpor        %%ymm6,%%ymm2,%%ymm2 \n"
+      "vpor        %%ymm6,%%ymm3,%%ymm3 \n"
+      "vmovdqu     %%ymm0,(%1) \n"
+      "vmovdqu     %%ymm1,0x20(%1) \n"
+      "vmovdqu     %%ymm2,0x40(%1) \n"
+      "vmovdqu     %%ymm3,0x60(%1) \n"
+      "lea         0x80(%1),%1 \n"
+      "sub         $0x20,%2 \n"
+      "jg          1b \n"
+      "vzeroupper \n"
+      : "+r"(src_raw),                // %0
+        "+r"(dst_argb),               // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskRAWToARGB),   // %3
+        "m"(kShuffleMaskRAWToARGB_0)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
 // Same code as RAWToARGB with different shuffler and A in low bits
@@ -1342,7 +1375,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
       "vphaddw     %%ymm3,%%ymm2,%%ymm2 \n" \
       "prefetcht0  1280(%0) \n" \
       "vpaddw      %%" #round \
       ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. 
*/ \ + ",%%ymm0,%%ymm0 \n" /* Add .5 */ \ "vpaddw %%" #round \ ",%%ymm2,%%ymm2 \n" \ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ @@ -1383,7 +1416,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - LABELALIGN RGBTOY(xmm5) + LABELALIGN // + RGBTOY(xmm5) // : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1443,7 +1477,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + "vmovdqa %6,%%ymm6 \n" // + LABELALIGN RGBTOY_AVX2( ymm7) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 @@ -1464,7 +1499,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqu %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + "vmovdqa %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm7) "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 @@ -1484,8 +1519,10 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm5) "vzeroupper \n" + "vmovdqa %5,%%ymm6 \n" // + LABELALIGN // + RGBTOY_AVX2(ymm5) // + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1503,7 +1540,7 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + "vmovdqa %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 @@ -1522,7 +1559,8 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( + "vmovdqa %5,%%ymm6 \n" // + LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1534,12 +1572,151 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { } #endif // HAS_RGBATOYJROW_AVX2 +#ifdef HAS_ARGBTOUV444ROW_SSSE3 + +struct RgbUVConstants { + vec8 kRGBToU; + vec8 kRGBToV; +}; + +void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "packuswb 
%%xmm2,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(rgbuvconstants->kRGBToU), // %4 + "m"(rgbuvconstants->kRGBToV), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ARGBTOUV444ROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_AVX2 + +void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + "vbroadcastf128 %4,%%ymm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + "vbroadcastf128 %6,%%ymm5 \n" + "vmovdqa %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm6 \n" + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm3,%%ymm6,%%ymm6 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vphaddw %%ymm6,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm6 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm6,%%ymm6 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vphaddw %%ymm6,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // unmutate. 
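+      // %2 was pre-biased to dst_v - dst_u by the "sub %1,%2" above, so
+      // the store below lands on the V plane while %1 walks the U plane
+      // and a single lea advances both.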
+ "vmovdqu %%ymm0,(%1,%2,1) \n" + "lea 0x80(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(rgbuvconstants->kRGBToU), // %4 + "m"(rgbuvconstants->kRGBToV), // %5 + "m"(kAddUV128), // %6 + "m"(kPermdARGBToY_AVX) // %7 + : "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", + "ymm7"); +} +#endif // HAS_ARGBTOUV444ROW_AVX2 + #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { + +void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" @@ -1572,33 +1749,181 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, "pavgb %%xmm7,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" "phaddw %%xmm2,%%xmm0 \n" "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" + "subl $0x10,%3 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 +#if defined(__i386__) + "+m"(width) // %3 +#else + "+rm"(width) // %3 +#endif : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 + "m"(rgbuvconstants->kRGBToU), // %5 + "m"(rgbuvconstants->kRGBToV), // %6 "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + +#endif // HAS_ARGBTOUVROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_SSSE3 + +// RGB to BT601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = -74 +// UR -0.2969 coefficient = -38 +// VB -0.1406 coefficient = -18 +// VG -0.7344 coefficient = -94 +// VR 0.875 coefficient = 112 + +static const struct RgbUVConstants kARGBI601UVConstants = { + {112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0}, + {-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0}}; + +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width, + &kARGBI601UVConstants); +} +#endif // HAS_ARGBTOUV444ROW_SSSE3 + +#ifdef HAS_ARGBTOUV444ROW_AVX2 +void ARGBToUV444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_AVX2(src_argb, dst_u, dst_v, width, + &kARGBI601UVConstants); +} +#endif // HAS_ARGBTOUV444ROW_AVX2 + +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width, + &kARGBI601UVConstants); +} + +static const struct RgbUVConstants 
kABGRI601UVConstants = { + {-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0}, + {112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0}}; + +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width, + &kABGRI601UVConstants); +} + +static const struct RgbUVConstants kBGRAI601UVConstants = { + {0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112}, + {0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18}}; + +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SSSE3(src_bgra, src_stride_bgra, dst_u, dst_v, width, + &kBGRAI601UVConstants); +} + +static const struct RgbUVConstants kRGBAI601UVConstants = { + {0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38}, + {0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112}}; + +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SSSE3(src_rgba, src_stride_rgba, dst_u, dst_v, width, + &kRGBAI601UVConstants); } #endif // HAS_ARGBTOUVROW_SSSE3 +#ifdef HAS_ARGBTOUVJ444ROW_SSSE3 +// RGB to JPEG coefficients +// UB 0.500 coefficient = 127 +// UG -0.33126 coefficient = -84 +// UR -0.16874 coefficient = -43 +// VB -0.08131 coefficient = -20 +// VG -0.41869 coefficient = -107 +// VR 0.500 coefficient = 127 + +static const struct RgbUVConstants kARGBJPEGUVConstants = { + {127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0}, + {-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, + 0}}; + +void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width, + &kARGBJPEGUVConstants); +} +#endif // HAS_ARGBTOUVJ444ROW_SSSE3 + +#ifdef HAS_ARGBTOUVJ444ROW_AVX2 +void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_AVX2(src_argb, dst_u, dst_v, width, + &kARGBJPEGUVConstants); +} +#endif // HAS_ARGBTOUVJ444ROW_AVX2 + +#ifdef HAS_ARGBTOUVJROW_SSSE3 +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width, + &kARGBJPEGUVConstants); +} + +static const struct RgbUVConstants kABGRJPEGUVConstants = { + {-43, -84, 127, 0, -43, -84, 127, 0, -43, -84, 127, 0, -43, -84, 127, 0}, + {127, -107, -20, 0, 127, -107, -20, 0, 127, -107, -20, 0, 127, -107, -20, + 0}}; + +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_SSSE3(src_abgr, src_stride_abgr, dst_u, dst_v, width, + &kABGRJPEGUVConstants); +} +#endif // HAS_ABGRTOUVJROW_SSSE3 + #if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) // vpshufb for vphaddw + vpackuswb packed to shorts. 
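With the matrix refactor above, every channel-order and colorspace combination reduces to one 32-byte RgbUVConstants table. pmaddubsw multiplies the unsigned pixel bytes by these signed 8-bit coefficients, which is presumably why the JPEG 0.5 weight appears as 127 rather than 128: it has to fit in an int8. Each 4-byte group mirrors the format's byte order in memory with a zero in the alpha position, so a new ordering is just a permutation of the same three weights. A sketch of that relationship, using a hypothetical helper that is not part of libyuv:

#include <stdint.h>
#include <string.h>

// Build one 4-byte coefficient group from per-channel weights, given the
// byte positions of B, G and R in the pixel. In libyuv's naming, ARGB is
// B,G,R,A in memory, ABGR is R,G,B,A, BGRA is A,R,G,B and RGBA is A,B,G,R.
static void MakeUVGroup(int8_t group[4], int8_t wb, int8_t wg, int8_t wr,
                        int b_pos, int g_pos, int r_pos) {
  memset(group, 0, 4);  // the alpha position keeps weight 0
  group[b_pos] = wb;
  group[g_pos] = wg;
  group[r_pos] = wr;
}

MakeUVGroup(g, 112, -74, -38, 0, 1, 2) reproduces the ARGB U group {112, -74, -38, 0} of kARGBI601UVConstants, and MakeUVGroup(g, 112, -74, -38, 2, 1, 0) gives the ABGR U group {-38, -74, 112, 0} of kABGRI601UVConstants.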
@@ -1643,12 +1968,13 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" @@ -1706,12 +2032,13 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" @@ -1862,196 +2189,6 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, } #endif // HAS_ABGRTOUVJROW_AVX2 -#ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kSub128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} -#endif // HAS_ARGBTOUVJROW_SSSE3 - -#ifdef HAS_ABGRTOUVJROW_SSSE3 -void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 
0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToVJ), // %5 - "m"(kABGRToUJ), // %6 - "m"(kSub128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} -#endif // HAS_ABGRTOUVJROW_SSSE3 - -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); -} -#endif // HAS_ARGBTOUV444ROW_SSSE3 - void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" @@ -2069,69 +2206,6 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "xmm7"); } -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, - int src_stride_bgra, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 
0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" @@ -2166,132 +2240,6 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "xmm7"); } -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, - int src_stride_rgba, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps 
$0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); -} - #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 @@ -2522,6 +2470,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, #else #define YUVTORGB_SETUP(yuvconstants) + // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB16(yuvconstants) \ "pcmpeqb %%xmm0,%%xmm0 \n" \