diff --git a/README.chromium b/README.chromium index 594e3db39..fb8509991 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1905 +Version: 1906 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index 31113c0e2..ec0e331cf 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -20,8 +20,9 @@ extern "C" { #endif // The following are available for Visual C and GCC: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) +#if !defined(LIBYUV_DISABLE_X86) && \ + ((defined(__x86_64__) && !defined(LIBYUV_ENABLE_ROWWIN)) || \ + defined(__i386__) || defined(_M_IX86)) #define HAS_HASHDJB2_SSE41 #define HAS_SUMSQUAREERROR_SSE2 #define HAS_HAMMINGDISTANCE_SSE42 @@ -36,13 +37,16 @@ extern "C" { #endif // The following are available for GCC and clangcl: -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_HAMMINGDISTANCE_SSSE3 #endif // The following are available for GCC and clangcl: #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ - (defined(__x86_64__) || defined(__i386__)) + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_HAMMINGDISTANCE_AVX2 #endif diff --git a/include/libyuv/cpu_support.h b/include/libyuv/cpu_support.h index 168764b27..9304ab2ac 100644 --- a/include/libyuv/cpu_support.h +++ b/include/libyuv/cpu_support.h @@ -44,21 +44,24 @@ extern "C" { #endif // __clang__ // GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if defined(__GNUC__) && !defined(LIBYUV_ENABLE_ROWWIN) && \ + (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) #define GCC_HAS_AVX2 1 #endif // GNUC >= 4.7 #endif // __GNUC__ // clang >= 3.4.0 required for AVX2. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if defined(__clang__) && !defined(LIBYUV_ENABLE_ROWWIN) && \ + (defined(__x86_64__) || defined(__i386__)) #if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) #define CLANG_HAS_AVX2 1 #endif // clang >= 3.4 #endif // __clang__ // clang >= 6.0.0 required for AVX512. -#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if defined(__clang__) && !defined(LIBYUV_ENABLE_ROWWIN) && \ + (defined(__x86_64__) || defined(__i386__)) // clang in xcode follows a different versioning scheme. // TODO(fbarchard): fix xcode 9 ios b/789. #if (__clang_major__ >= 7) && !defined(__APPLE__) @@ -67,8 +70,9 @@ extern "C" { #endif // __clang__ // Visual C 2012 required for AVX2. 
-#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ - _MSC_VER >= 1700 +#if defined(_M_IX86) && \ + (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 5b79efffc..dbf51de52 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -38,8 +38,10 @@ extern "C" { #endif #endif // The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) && !defined(LIBYUV_ENABLE_ROWWIN)) || \ + defined(__i386__)) #define HAS_ARGBAFFINEROW_SSE2 #endif diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index bbf217f0a..c00d83c69 100644 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -20,6 +20,7 @@ extern "C" { #endif // The following are available for Visual C 32 bit: +// TODO - port to clangcl on rotate_win #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ !defined(__clang__) #define HAS_TRANSPOSEWX8_SSSE3 @@ -27,14 +28,17 @@ extern "C" { #endif // The following are available for GCC 32 or 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || defined(__x86_64__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSE4X4_32_SSE2 #define HAS_TRANSPOSE4X4_32_AVX2 #endif // The following are available for 64 bit GCC: -#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) +#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \ + !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_TRANSPOSEWX8_FAST_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 74372e1db..e911eddd5 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -23,10 +23,11 @@ extern "C" { #endif // The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) && !defined(LIBYUV_ENABLE_ROWWIN)) || \ + defined(__i386__)) // Conversions: -#define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBEXTRACTALPHAROW_SSE2 @@ -38,9 +39,6 @@ extern "C" { #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565ROW_SSE2 -#define HAS_ARGBTOYJROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 -#define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 @@ -68,13 +66,8 @@ extern "C" { #define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 -#define HAS_RAWTOYJROW_SSSE3 -#define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 -#define HAS_RGB24TOYJROW_SSSE3 -#define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 -#define HAS_RGBATOYROW_SSSE3 #define HAS_SETROW_ERMS #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 @@ -88,7 +81,6 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #if !defined(LIBYUV_BIT_EXACT) #define HAS_ABGRTOUVROW_SSSE3 -#define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 
#define HAS_RGBATOUVROW_SSSE3 @@ -124,13 +116,24 @@ extern "C" { // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. -#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ - defined(_MSC_VER) +#if (defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I444ALPHATOARGBROW_SSSE3 #endif +#if (defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +// TODO(fbarchard): fix LIBYUV_ENABLE_ROWWIN with clang +#define HAS_I422ALPHATOARGBROW_AVX2 +#define HAS_I444ALPHATOARGBROW_AVX2 +#endif + #endif // The following are available on all x86 platforms, but @@ -145,8 +148,6 @@ extern "C" { #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 -#define HAS_ARGBTOYJROW_AVX2 -#define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 @@ -167,8 +168,6 @@ extern "C" { #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB24ROW_AVX2 -#define HAS_RAWTOYJROW_AVX2 -#define HAS_RGB24TOYJROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -179,10 +178,6 @@ extern "C" { #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 // #define HAS_HALFFLOATROW_F16C // Enable to test half float cast -#if !defined(LIBYUV_BIT_EXACT) -#define HAS_ARGBTOUVJROW_AVX2 -#define HAS_ARGBTOUVROW_AVX2 -#endif // Effects: #define HAS_ARGBADDROW_AVX2 @@ -190,14 +185,6 @@ extern "C" { // #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 - -#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ - defined(_MSC_VER) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 -#define HAS_I444ALPHATOARGBROW_AVX2 -#endif #endif // The following are available for AVX2 Visual C 32 bit: @@ -224,7 +211,10 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) +#define HAS_RAWTOYJROW_SSSE3 #define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ABGRTOYJROW_SSSE3 @@ -272,8 +262,20 @@ extern "C" { #define HAS_SPLITXRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 #define HAS_YUY2TONVUVROW_SSE2 +// TODO: port row_win to use 8 bit coefficients. +#define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 +#define HAS_BGRATOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_ABGRTOYROW_SSSE3 +#define HAS_RGB24TOYJROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 + #if !defined(LIBYUV_BIT_EXACT) +// TODO: adjust row_win to use 8 bit negative coefficients. 
#define HAS_ABGRTOUVJROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 #endif #if defined(__x86_64__) || !defined(__pic__) @@ -286,9 +288,15 @@ extern "C" { // The following are available for AVX2 gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) +#define HAS_RAWTOYJROW_AVX2 +#define HAS_RGB24TOYJROW_AVX2 + +#define HAS_ARGBTOYJROW_AVX2 +#define HAS_ARGBTOYROW_AVX2 #define HAS_AB64TOARGBROW_AVX2 #define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ABGRTOYJROW_AVX2 @@ -345,6 +353,8 @@ extern "C" { #if !defined(LIBYUV_BIT_EXACT) #define HAS_ABGRTOUVJROW_AVX2 #define HAS_ABGRTOUVROW_AVX2 +#define HAS_ARGBTOUVJROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 #endif #if defined(__x86_64__) || !defined(__pic__) @@ -358,8 +368,9 @@ extern "C" { // The following are available for AVX512 clang x86 platforms: // TODO(fbarchard): Port to GCC and Visual C // TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) && \ + !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_COPYROW_AVX512BW #define HAS_ARGBTORGB24ROW_AVX512VBMI #define HAS_CONVERT16TO8ROW_AVX512BW diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index f5294f082..ccee9ada4 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -21,8 +21,10 @@ extern "C" { #endif // The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) && !defined(LIBYUV_ENABLE_ROWWIN)) || \ + defined(__i386__)) #define HAS_FIXEDDIV1_X86 #define HAS_FIXEDDIV_X86 #define HAS_SCALEADDROW_SSE2 @@ -41,7 +43,9 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_SCALEUVROWDOWN2BOX_SSSE3 #define HAS_SCALEROWUP2_LINEAR_SSE2 #define HAS_SCALEROWUP2_LINEAR_SSSE3 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index f840f057b..1761ab423 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1905 +#define LIBYUV_VERSION 1906 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 83237ff38..33a725e58 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -19,7 +19,9 @@ extern "C" { #endif // This module is for GCC x86 and x64. 
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) // "memory" clobber prevents the reads from being removed diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 36c5e575c..756f83cb3 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -116,7 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) { uint32_t hash = seed; const uint32_t c16 = 0x92d9e201; // 33^16 uint32_t tmp, tmp2; - asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" + asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" // count is always a multiple of 16. diff --git a/source/compare_win.cc b/source/compare_win.cc index 9bb27f1dd..9d5bb27cd 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -23,8 +23,8 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \ + (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, diff --git a/source/convert.cc b/source/convert.cc index f48df28f2..dbe785d36 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -4034,16 +4034,22 @@ int RGB565ToI420(const uint8_t* src_rgb565, // Neon version does direct RGB565 to YUV. #if defined(HAS_RGB565TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGB565ToUVRow = RGB565ToUVRow_Any_NEON; RGB565ToYRow = RGB565ToYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { RGB565ToYRow = RGB565ToYRow_NEON; + } + } +#endif +// Neon version does direct RGB565 to YUV. +#if defined(HAS_RGB565TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { RGB565ToUVRow = RGB565ToUVRow_NEON; } } +#endif // MSA version does direct RGB565 to YUV. -#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \ - defined(HAS_RGB565TOYROW_LASX)) #if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB565ToUVRow = RGB565ToUVRow_Any_MSA; @@ -4075,7 +4081,6 @@ int RGB565ToI420(const uint8_t* src_rgb565, } #endif // Other platforms do intermediate conversion from RGB565 to ARGB. 
-#else #if defined(HAS_RGB565TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; @@ -4100,14 +4105,6 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; - } - } -#endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; @@ -4116,6 +4113,14 @@ int RGB565ToI420(const uint8_t* src_rgb565, } } #endif +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } + } +#endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; @@ -4123,7 +4128,6 @@ int RGB565ToI420(const uint8_t* src_rgb565, ARGBToUVRow = ARGBToUVRow_AVX2; } } -#endif #endif { #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ @@ -4214,18 +4218,22 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, // Neon version does direct ARGB1555 to YUV. #if defined(HAS_ARGB1555TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB1555ToYRow = ARGB1555ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; - } } } +#endif +#if defined(HAS_ARGB1555TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } +#endif + // MSA version does direct ARGB1555 to YUV. -#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \ - defined(HAS_ARGB1555TOYROW_LASX)) #if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; @@ -4256,8 +4264,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } } #endif + // Other platforms do intermediate conversion from ARGB1555 to ARGB. -#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; @@ -4305,7 +4313,6 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, ARGBToUVRow = ARGBToUVRow_AVX2; } } -#endif #endif { #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ @@ -4398,17 +4405,20 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, // Neon version does direct ARGB4444 to YUV. #if defined(HAS_ARGB4444TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB4444ToYRow = ARGB4444ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; - } } } -// Other platforms do intermediate conversion from ARGB4444 to ARGB. 
-#else +#endif +#if defined(HAS_ARGB4444TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } +#endif #if defined(HAS_ARGB4444TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; @@ -4520,7 +4530,6 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, ARGBToUVRow = ARGBToUVRow_LASX; } } -#endif #endif { diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index e07bedfa7..ae7436b12 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -17,7 +17,9 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) diff --git a/source/rotate_win.cc b/source/rotate_win.cc index a78873f84..03eeee3a6 100644 --- a/source/rotate_win.cc +++ b/source/rotate_win.cc @@ -17,8 +17,8 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \ + (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, diff --git a/source/row_common.cc b/source/row_common.cc index 057c3bb9f..7101ec321 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -36,14 +36,6 @@ extern "C" { // LIBYUV_UNLIMITED_BT709 // LIBYUV_UNLIMITED_BT2020 -// The following macro from row_win makes the C code match the row_win code, -// which is 7 bit fixed point for ARGBToI420: -#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \ - defined(_MSC_VER) && !defined(__clang__) && \ - (defined(_M_IX86) || defined(_M_X64)) -#define LIBYUV_RGB7 1 -#endif - #if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \ defined(__i386__) || defined(_M_IX86)) #define LIBYUV_ARGBTOUV_PAVGB 1 @@ -623,11 +615,21 @@ void AR64ShuffleRow_C(const uint8_t* src_ar64, dst_ar64_16 += 4; } } +// BT601 8 bit Y: +// b 0.114 * 219 = 24.966 = 25 +// g 0.587 * 219 = 128.553 = 129 +// r 0.299 * 219 = 65.481 = 66 +// BT601 8 bit U: +// b 0.875 * 128 = 112.0 = 112 +// g -0.5781 * 128 = −73.9968 = -74 +// r -0.2969 * 128 = −38.0032 = -38 +// BT601 8 bit V: +// b -0.1406 * 128 = −17.9968 = -18 +// g -0.7344 * 128 = −94.0032 = -94 +// r 0.875 * 128 = 112.0 = 112 -#ifdef LIBYUV_RGB7 -// Old 7 bit math for Visual C static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, ((33 * r + 65 * g + 13 * b) >> 7) + 16); + return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); } static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8000) >> 8); @@ -635,37 +637,10 @@ static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8000) >> 8); } -#else -// 8 bit -// Intel SSE/AVX uses the following equivalent formula -// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. 
-// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + -// 0x7e80) >> 8; - -static __inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (66 * r + 129 * g + 25 * b + 0x1080) >> 8); -} -static __inline uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); -} -static __inline uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) { - return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); -} -#endif - #define AVGB(a, b) (((a) + (b) + 1) >> 1) -// ARM uses uint16. TODO: Make ARM use uint8 to allow dotproduct. -#if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline int RGBxToU(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST(uint8_t, (112 * b - 74 * g - 38 * r + 0x8080) >> 8); -} -static __inline int RGBxToV(uint16_t r, uint16_t g, uint16_t b) { - return STATIC_CAST(uint8_t, (112 * r - 94 * g - 18 * b + 0x8080) >> 8); -} -#endif // ARGBToY_C and ARGBToUV_C -// Intel version mimic SSE/AVX which does 2 pavgb +// Intel version of UV mimic SSE/AVX which does 2 pavgb #if defined(LIBYUV_ARGBTOUV_PAVGB) #define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ @@ -718,28 +693,28 @@ static __inline int RGBxToV(uint16_t r, uint16_t g, uint16_t b) { const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 2) >> \ - 2; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 2) >> \ - 2; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 2) >> \ - 2; \ - dst_u[0] = RGBxToU(ar, ag, ab); \ - dst_v[0] = RGBxToV(ar, ag, ab); \ + uint8_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 2) >> \ + 2; \ + uint8_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 2) >> \ + 2; \ + uint8_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 2) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ dst_v += 1; \ } \ if (width & 1) { \ - uint16_t ab = (src_rgb[B] + src_rgb1[B] + 1) >> 1; \ - uint16_t ag = (src_rgb[G] + src_rgb1[G] + 1) >> 1; \ - uint16_t ar = (src_rgb[R] + src_rgb1[R] + 1) >> 1; \ - dst_u[0] = RGBxToU(ar, ag, ab); \ - dst_v[0] = RGBxToV(ar, ag, ab); \ + uint8_t ab = (src_rgb[B] + src_rgb1[B] + 1) >> 1; \ + uint8_t ag = (src_rgb[G] + src_rgb1[G] + 1) >> 1; \ + uint8_t ar = (src_rgb[R] + src_rgb1[R] + 1) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ } \ } #endif @@ -752,32 +727,15 @@ MAKEROWY(RGB24, 2, 1, 0, 3) MAKEROWY(RAW, 0, 1, 2, 3) #undef MAKEROWY -// JPeg uses a variation on BT.601-1 full range +// JPeg uses BT.601-1 full range // y = 0.29900 * r + 0.58700 * g + 0.11400 * b // u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center // v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center -// BT.601 Mpeg range uses: -// b 0.1016 * 255 = 25.908 = 25 -// g 0.5078 * 255 = 129.489 = 129 -// r 0.2578 * 255 = 65.739 = 66 -// JPeg 7 bit Y (deprecated) -// b 0.11400 * 128 = 14.592 = 15 -// g 0.58700 * 128 = 75.136 = 75 -// r 0.29900 * 128 = 38.272 = 38 // JPeg 8 bit Y: // b 0.11400 * 256 = 29.184 = 29 // g 0.58700 * 256 = 150.272 = 150 // r 0.29900 * 256 = 76.544 = 77 // JPeg 8 bit U: -// b 0.50000 * 255 = 127.5 = 127 
-// g -0.33126 * 255 = -84.4713 = -84 -// r -0.16874 * 255 = -43.0287 = -43 -// JPeg 8 bit V: -// b -0.08131 * 255 = -20.73405 = -20 -// g -0.41869 * 255 = -106.76595 = -107 -// r 0.50000 * 255 = 127.5 = 127 -// TODO: consider 256 for fixed point on UV -// JPeg 8 bit U: // b 0.50000 * 256 = 128.0 = 128 // g -0.33126 * 256 = −84.80256 = -85 // r -0.16874 * 256 = −43.19744 = -43 @@ -786,32 +744,16 @@ MAKEROWY(RAW, 0, 1, 2, 3) // g -0.41869 * 256 = −107.18464 = -107 // r 0.50000 * 256 = 128.0 = 128 -#ifdef LIBYUV_RGB7 -// Old 7 bit math for compatibility on unsupported platforms. -static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { - return (38 * r + 75 * g + 15 * b + 64) >> 7; -} -#else // 8 bit static __inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (77 * r + 150 * g + 29 * b + 128) >> 8; } -#endif - static __inline uint8_t RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; + return (128 * b - 85 * g - 43 * r + 0x8000) >> 8; } static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { - return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; + return (128 * r - 107 * g - 21 * b + 0x8000) >> 8; } -#if !defined(LIBYUV_ARGBTOUV_PAVGB) -static __inline uint8_t RGBxToUJ(uint16_t r, uint16_t g, uint16_t b) { - return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; -} -static __inline uint8_t RGBxToVJ(uint16_t r, uint16_t g, uint16_t b) { - return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; -} -#endif // ARGBToYJ_C and ARGBToUVJ_C // Intel version mimic SSE/AVX which does 2 pavgb @@ -867,17 +809,17 @@ static __inline uint8_t RGBxToVJ(uint16_t r, uint16_t g, uint16_t b) { const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 2) >> \ - 2; \ - uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 2) >> \ - 2; \ - uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 2) >> \ - 2; \ - dst_u[0] = RGBxToUJ(ar, ag, ab); \ - dst_v[0] = RGBxToVJ(ar, ag, ab); \ + uint8_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 2) >> \ + 2; \ + uint8_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 2) >> \ + 2; \ + uint8_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 2) >> \ + 2; \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ @@ -887,8 +829,8 @@ static __inline uint8_t RGBxToVJ(uint16_t r, uint16_t g, uint16_t b) { uint16_t ab = (src_rgb[B] + src_rgb1[B] + 1) >> 1; \ uint16_t ag = (src_rgb[G] + src_rgb1[G] + 1) >> 1; \ uint16_t ar = (src_rgb[R] + src_rgb1[R] + 1) >> 1; \ - dst_u[0] = RGBxToUJ(ar, ag, ab); \ - dst_v[0] = RGBxToVJ(ar, ag, ab); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ } \ } @@ -993,11 +935,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2; - uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2; - uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2; - dst_u[0] = RGBxToU(r, g, b); - dst_v[0] = RGBxToV(r, g, b); + uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2; + uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2; + uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2; + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); #endif src_rgb565 += 4; @@ -1021,19 +963,11 
@@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, g2 = STATIC_CAST(uint8_t, (g2 << 2) | (g2 >> 4)); r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); -#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b2 + 1) >> 1; - uint16_t g = (g0 + g2 + 1) >> 1; - uint16_t r = (r0 + r2 + 1) >> 1; - dst_u[0] = RGBxToU(r, g, b); - dst_v[0] = RGBxToV(r, g, b); -#endif } } @@ -1082,11 +1016,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2; - uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2; - uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2; - dst_u[0] = RGBxToU(r, g, b); - dst_v[0] = RGBxToV(r, g, b); + uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2; + uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2; + uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2; + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); #endif src_argb1555 += 4; @@ -1111,19 +1045,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, g2 = STATIC_CAST(uint8_t, (g2 << 3) | (g2 >> 2)); r2 = STATIC_CAST(uint8_t, (r2 << 3) | (r2 >> 2)); -#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b2 + 1) >> 1; - uint16_t g = (g0 + g2 + 1) >> 1; - uint16_t r = (r0 + r2 + 1) >> 1; - dst_u[0] = RGBxToU(r, g, b); - dst_v[0] = RGBxToV(r, g, b); -#endif } } @@ -1168,11 +1094,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else - uint16_t b = (b0 + b1 + b2 + b3 + 2) >> 2; - uint16_t g = (g0 + g1 + g2 + g3 + 2) >> 2; - uint16_t r = (r0 + r1 + r2 + r3 + 2) >> 2; - dst_u[0] = RGBxToU(r, g, b); - dst_v[0] = RGBxToV(r, g, b); + uint8_t b = (b0 + b1 + b2 + b3 + 2) >> 2; + uint8_t g = (g0 + g1 + g2 + g3 + 2) >> 2; + uint8_t r = (r0 + r1 + r2 + r3 + 2) >> 2; + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); #endif src_argb4444 += 4; @@ -1195,19 +1121,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, g2 = STATIC_CAST(uint8_t, (g2 << 4) | g2); r2 = STATIC_CAST(uint8_t, (r2 << 4) | r2); -#if defined(LIBYUV_ARGBTOUV_PAVGB) uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); -#else - uint16_t b = (b0 + b2 + 1) >> 1; - uint16_t g = (g0 + g2 + 1) >> 1; - uint16_t r = (r0 + r2 + 1) >> 1; - dst_u[0] = RGBxToU(r, g, b); - dst_v[0] = RGBxToV(r, g, b); -#endif } } @@ -4036,7 +3954,7 @@ void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { #define MAXTWIDTH 2048 #if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ - defined(HAS_I422TORGB565ROW_SSSE3) + defined(HAS_I422TORGB565ROW_SSSE3) && !defined(LIBYUV_ENABLE_ROWWIN) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 5c6431aa6..cc2238cc5 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -15,7 +15,9 @@ extern "C" { #endif // This module is for GCC x86 and x64. 
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -35,25 +37,6 @@ static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) - -static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0}; - -static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, - 127, -84, -43, 0, 127, -84, -43, 0}; - -static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0, - -43, -84, 127, 0, -43, -84, 127, 0}; - -static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0}; - -static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, - -20, -107, 127, 0, -20, -107, 127, 0}; - -static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0, - 127, -107, -20, 0, 127, -107, -20, 0}; - // Constants for BGRA static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; @@ -62,21 +45,17 @@ static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u}; -static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, - -38, -74, 112, 0, -38, -74, 112, 0}; - -static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, - 112, -94, -18, 0, 112, -94, -18, 0}; - // Constants for RGBA. static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u}; - +// 126 (7e) - (-109..110) = 16..235 static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u}; +static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; -static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; +static const uvec16 kAddUV128 = {0x8000u, 0x8000u, 0x8000u, 0x8000u, + 0x8000u, 0x8000u, 0x8000u, 0x8000u}; static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u}; @@ -508,35 +487,35 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + 
"pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -546,35 +525,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -1192,21 +1171,21 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("movdqa %3,%%xmm2 \n" + asm volatile("movdqa %3,%%xmm2 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1297,21 +1276,21 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpsrlw $8,%%ymm0,%%ymm0 \n" - "vpsrlw $8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x40(%0),%0 \n" - "lea 0x20(%1),%1 \n" - 
"sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" + asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1320,6 +1299,8 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, } #endif +// clang-format off + // TODO(mraptis): Consider passing R, G, B multipliers as parameter. // round parameter is register containing value to add before shift. #define RGBTOY(round) \ @@ -1344,10 +1325,8 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "phaddw %%xmm0,%%xmm6 \n" \ "phaddw %%xmm2,%%xmm1 \n" \ "prefetcht0 1280(%0) \n" \ - "paddw %%" #round \ - ",%%xmm6 \n" \ - "paddw %%" #round \ - ",%%xmm1 \n" \ + "paddw %%" #round ",%%xmm6 \n" \ + "paddw %%" #round ",%%xmm1 \n" \ "psrlw $0x8,%%xmm6 \n" \ "psrlw $0x8,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm6 \n" \ @@ -1374,10 +1353,8 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ "prefetcht0 1280(%0) \n" \ - "vpaddw %%" #round \ - ",%%ymm0,%%ymm0 \n" /* Add .5 */ \ - "vpaddw %%" #round \ - ",%%ymm2,%%ymm2 \n" \ + "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add 16 */ \ + "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ @@ -1385,8 +1362,9 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "vmovdqu %%ymm0,(%1) \n" \ "lea 0x20(%1),%1 \n" \ "sub $0x20,%2 \n" \ - "jg 1b \n" \ - "vzeroupper \n" + "jg 1b \n" + +// clang-format on #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
@@ -1394,9 +1372,10 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - "movdqa %5,%%xmm7 \n" + "movdqa %5,%%xmm7 \n" // - LABELALIGN RGBTOY(xmm7) + LABELALIGN "" // + RGBTOY(xmm7) // : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1415,15 +1394,18 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" // - LABELALIGN // - RGBTOY(xmm5) // + LABELALIGN "" // + RGBTOY(xmm7) // : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + "m"(kSub128), // %4 + "m"(kAddY0) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOYJROW_SSSE3 @@ -1434,14 +1416,18 @@ void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" // - LABELALIGN RGBTOY(xmm5) + LABELALIGN "" // + RGBTOY(xmm7) // : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kABGRToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + "m"(kSub128), // %4 + "m"(kAddY0) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ABGRTOYJROW_SSSE3 @@ -1452,14 +1438,18 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" + "movdqa %5,%%xmm7 \n" // - LABELALIGN RGBTOY(xmm5) + LABELALIGN "" // + RGBTOY(xmm7) // : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kRGBAToYJ), // %3 - "m"(kSub128) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + "m"(kSub128), // %4 + "m"(kAddY0) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_RGBATOYJROW_SSSE3 @@ -1478,7 +1468,10 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" "vmovdqa %6,%%ymm6 \n" // - LABELALIGN RGBTOY_AVX2(ymm7) "vzeroupper \n" + + LABELALIGN "" // + RGBTOY_AVX2(ymm7) // + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1498,8 +1491,11 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" - "vmovdqa %6,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm7) "vzeroupper \n" + "vmovdqa %6,%%ymm6 \n" // + + LABELALIGN "" // + RGBTOY_AVX2(ymm7) // + "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1518,19 +1514,23 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqa %5,%%ymm6 \n" // - LABELALIGN // - RGBTOY_AVX2(ymm5) // + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqa %6,%%ymm6 \n" // + + LABELALIGN "" // + RGBTOY_AVX2(ymm7) // "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToYJ), // %3 "m"(kSub128), // %4 - "m"(kPermdARGBToY_AVX) // %5 + "m"(kAddY0), // %5 + "m"(kPermdARGBToY_AVX) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } + #endif // 
HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ABGRTOYJROW_AVX2 @@ -1539,14 +1539,19 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqa %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( - ymm5) "vzeroupper \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqa %6,%%ymm6 \n" // + + LABELALIGN "" // + RGBTOY_AVX2(ymm7) // + "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kABGRToYJ), // %3 "m"(kSub128), // %4 - "m"(kPermdARGBToY_AVX) // %5 + "m"(kAddY0), // %5 + "m"(kPermdARGBToY_AVX) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } @@ -1558,20 +1563,27 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqa %5,%%ymm6 \n" // - LABELALIGN RGBTOY_AVX2(ymm5) "vzeroupper \n" + "vbroadcastf128 %5,%%ymm7 \n" + "vmovdqa %6,%%ymm6 \n" // + + LABELALIGN "" // + RGBTOY_AVX2(ymm7) // + "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kRGBAToYJ), // %3 "m"(kSub128), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + "m"(kAddY0), // %5 + "m"(kPermdARGBToY_AVX) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_RGBATOYJROW_AVX2 #ifdef HAS_ARGBTOUV444ROW_SSSE3 +// Coefficients expressed as negatives to allow 128 struct RgbUVConstants { vec8 kRGBToU; vec8 kRGBToV; @@ -1600,12 +1612,14 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, "pmaddubsw %%xmm3,%%xmm6 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm6,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" + "movdqa %%xmm5,%%xmm1 \n" + "movdqa %%xmm5,%%xmm6 \n" + "psubw %%xmm0,%%xmm1 \n" + "psubw %%xmm2,%%xmm6 \n" + "psrlw $0x8,%%xmm1 \n" + "psrlw $0x8,%%xmm6 \n" + "packuswb %%xmm6,%%xmm1 \n" + "movdqu %%xmm1,(%1) \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1617,13 +1631,16 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, "pmaddubsw %%xmm4,%%xmm6 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm6,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" + "movdqa %%xmm5,%%xmm1 \n" + "movdqa %%xmm5,%%xmm6 \n" + "psubw %%xmm0,%%xmm1 \n" + "psubw %%xmm2,%%xmm6 \n" + "psrlw $0x8,%%xmm1 \n" + "psrlw $0x8,%%xmm6 \n" + "packuswb %%xmm6,%%xmm1 \n" + "movdqu %%xmm1,0x00(%1,%2,1) \n" + "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,0x00(%1,%2,1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" @@ -1664,8 +1681,8 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, "vpmaddubsw %%ymm3,%%ymm6,%%ymm6 \n" "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates "vphaddw %%ymm6,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsubw %%ymm0,%%ymm5,%%ymm0 \n" + "vpsubw %%ymm2,%%ymm5,%%ymm2 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm2,%%ymm2 \n" "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates @@ -1682,8 +1699,8 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, "vpmaddubsw %%ymm4,%%ymm6,%%ymm6 \n" "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates "vphaddw %%ymm6,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsubw %%ymm0,%%ymm5,%%ymm0 \n" + "vpsubw 
%%ymm2,%%ymm5,%%ymm2 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm2,%%ymm2 \n" "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates @@ -1707,6 +1724,78 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_AVX2 +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; + +void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm6,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubw %%ymm0,%%ymm5,%%ymm0 \n" + "vpsubw %%ymm1,%%ymm5,%%ymm1 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "subl $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 +#if defined(__i386__) + "+m"(width) // %3 +#else + "+rm"(width) // %3 +#endif + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(rgbuvconstants->kRGBToU), // %6 + "m"(rgbuvconstants->kRGBToV), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} + #ifdef HAS_ARGBTOUVROW_SSSE3 void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, @@ -1735,7 +1824,6 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, "movdqu 0x30(%0),%%xmm6 \n" "movdqu 0x30(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" @@ -1745,6 +1833,7 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, "shufps $0x88,%%xmm6,%%xmm2 \n" "shufps $0xdd,%%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm0 \n" @@ -1753,13 +1842,15 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, "pmaddubsw %%xmm4,%%xmm6 \n" "phaddw %%xmm2,%%xmm0 \n" "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,0x00(%1,%2,1) \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "psubw %%xmm0,%%xmm2 \n" + "psubw %%xmm1,%%xmm6 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm6 \n" + "packuswb %%xmm6,%%xmm2 \n" + "movlps %%xmm2,(%1) \n" + "movhps 
%%xmm2,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" "subl $0x10,%3 \n" "jg 1b \n" @@ -1792,8 +1883,8 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, // VR 0.875 coefficient = 112 static const struct RgbUVConstants kARGBI601UVConstants = { - {112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0}, - {-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0}}; + {-112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0}, + {18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0}}; void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, @@ -1823,10 +1914,18 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width, &kARGBI601UVConstants); } +void ARGBToUVRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width, + &kARGBI601UVConstants); +} static const struct RgbUVConstants kABGRI601UVConstants = { - {-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0}, - {112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0}}; + {38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0}, + {-112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0}}; void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, @@ -1837,9 +1936,18 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, &kABGRI601UVConstants); } +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width, + &kABGRI601UVConstants); +} + static const struct RgbUVConstants kBGRAI601UVConstants = { - {0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112}, - {0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18}}; + {0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112}, + {0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18}}; void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, @@ -1851,8 +1959,8 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, } static const struct RgbUVConstants kRGBAI601UVConstants = { - {0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38}, - {0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112}}; + {0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38}, + {0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112}}; void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, @@ -1866,17 +1974,16 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, #ifdef HAS_ARGBTOUVJ444ROW_SSSE3 // RGB to JPEG coefficients -// UB 0.500 coefficient = 127 -// UG -0.33126 coefficient = -84 +// UB 0.500 coefficient = 128 +// UG -0.33126 coefficient = -85 // UR -0.16874 coefficient = -43 -// VB -0.08131 coefficient = -20 +// VB -0.08131 coefficient = -21 // VG -0.41869 coefficient = -107 -// VR 0.500 coefficient = 127 +// VR 0.500 coefficient = 128 static const struct RgbUVConstants kARGBJPEGUVConstants = { - {127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0}, - {-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, - 0}}; + {-128, 85, 43, 0, -128, 85, 43, 0, -128, 85, 43, 0, -128, 85, 43, 0}, + {21, 107, -128, 0, 21, 107, -128, 0, 21, 107, -128, 0, 21, 107, -128, 0}}; void ARGBToUVJ444Row_SSSE3(const uint8_t* 
src_argb, uint8_t* dst_u, @@ -1885,6 +1992,7 @@ void ARGBToUVJ444Row_SSSE3(const uint8_t* src_argb, ARGBToUV444MatrixRow_SSSE3(src_argb, dst_u, dst_v, width, &kARGBJPEGUVConstants); } + #endif // HAS_ARGBTOUVJ444ROW_SSSE3 #ifdef HAS_ARGBTOUVJ444ROW_AVX2 @@ -1897,6 +2005,28 @@ void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJ444ROW_AVX2 +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width, + &kARGBJPEGUVConstants); +} + +static const struct RgbUVConstants kABGRJPEGUVConstants = { + {43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0, 43, 85, -128, 0}, + {-128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0, -128, 107, 21, 0}}; + +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width, + &kABGRJPEGUVConstants); +} + #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, @@ -1907,11 +2037,6 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, &kARGBJPEGUVConstants); } -static const struct RgbUVConstants kABGRJPEGUVConstants = { - {-43, -84, 127, 0, -43, -84, 127, 0, -43, -84, 127, 0, -43, -84, 127, 0}, - {127, -107, -20, 0, 127, -107, -20, 0, 127, -107, -20, 0, 127, -107, -20, - 0}}; - void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, @@ -1922,278 +2047,14 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, } #endif // HAS_ABGRTOUVJROW_SSSE3 -#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ - defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) -// vpshufb for vphaddw + vpackuswb packed to shorts. 
-static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -#endif - -#if defined(HAS_ARGBTOUVROW_AVX2) -void ARGBToUVRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOUVROW_AVX2 - -#ifdef HAS_ABGRTOUVROW_AVX2 -void ABGRToUVRow_AVX2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), 
// %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kAddUV128), // %5 - "m"(kABGRToV), // %6 - "m"(kABGRToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ABGRTOUVROW_AVX2 - -#ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8_t* src_argb, - int src_stride_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kSub128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ARGBTOUVJROW_AVX2 - -// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix -#ifdef HAS_ABGRTOUVJROW_AVX2 -void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, - int src_stride_abgr, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - asm volatile( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vmovdqu 0x40(%0),%%ymm2 \n" - "vmovdqu 0x60(%0),%%ymm3 \n" - "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" - "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" - "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" - "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" - "lea 0x80(%0),%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 
$0x0,%%ymm0,(%1) \n" - "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_abgr), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kSub128), // %5 - "m"(kABGRToVJ), // %6 - "m"(kABGRToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} -#endif // HAS_ABGRTOUVJROW_AVX2 - void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" - LABELALIGN RGBTOY(xmm7) + LABELALIGN "" // + RGBTOY(xmm7) : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2210,7 +2071,8 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" - LABELALIGN RGBTOY(xmm7) + LABELALIGN "" // + RGBTOY(xmm7) : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2227,7 +2089,8 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" - LABELALIGN RGBTOY(xmm7) + LABELALIGN "" // + RGBTOY(xmm7) : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -2603,12 +2466,12 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -2929,12 +2792,12 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -2961,12 +2824,12 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -3027,12 +2890,12 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -3055,12 +2918,12 @@ void OMITFP 
NV12ToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -3076,12 +2939,12 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [vu_buf] "+r"(vu_buf), // %[vu_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -3099,7 +2962,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, asm volatile( "movdqa %[kShuffleYUY2Y],%%xmm6 \n" "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" @@ -3120,7 +2983,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, asm volatile( "movdqa %[kShuffleUYVYY],%%xmm6 \n" "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" @@ -3140,12 +3003,12 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[u_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -3161,12 +3024,12 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[u_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -3995,13 +3858,13 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA210_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] @@ -4030,13 +3893,13 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, const struct 
YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA410_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA410_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] @@ -4105,13 +3968,13 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA444_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -4139,13 +4002,13 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -4215,13 +4078,13 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READNV12_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READNV12_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4241,13 +4104,13 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READNV21_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READNV21_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [vu_buf] "+r"(vu_buf), // %[vu_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4269,7 +4132,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, asm volatile( "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -4296,7 +4159,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, asm volatile( "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb 
%%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -4322,13 +4185,13 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READP210_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READP210_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4348,13 +4211,13 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READP410_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READP410_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4533,16 +4396,16 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4554,18 +4417,18 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4581,16 +4444,16 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu -0x10(%0,%2,2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 
"+r"(temp_width) // %2 @@ -4602,18 +4465,18 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(temp_width) // %2 @@ -4706,17 +4569,17 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("lea -0x10(%0,%2,4),%0 \n" + asm volatile("lea -0x10(%0,%2,4),%0 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4730,16 +4593,16 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile("vmovdqu %3,%%ymm5 \n" + asm volatile("vmovdqu %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4963,20 +4826,20 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%0),%%zmm0 \n" - "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" - "lea 0x20(%0),%0 \n" - "vpsllw $0x8,%%zmm1,%%zmm1 \n" - "vporq %%zmm0,%%zmm1,%%zmm2 \n" - "vmovdqu64 %%zmm2,(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%0),%%zmm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" + "lea 0x20(%0),%0 \n" + "vpsllw $0x8,%%zmm1,%%zmm1 \n" + "vporq %%zmm0,%%zmm1,%%zmm2 \n" + "vmovdqu64 %%zmm2,(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -4991,20 +4854,20 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%0),%%ymm0 \n" - "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x10(%0),%0 \n" - "vpsllw $0x8,%%ymm1,%%ymm1 \n" - "vpor %%ymm0,%%ymm1,%%ymm2 \n" - "vmovdqu %%ymm2,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + 
"1: \n" + "vpmovzxbw (%0),%%ymm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5019,21 +4882,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5268,24 +5131,24 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile("vpbroadcastw %3,%%zmm2 \n" + asm volatile("vpbroadcastw %3,%%zmm2 \n" // 64 pixels per loop. LABELALIGN - "1: \n" - "vmovups (%0),%%zmm0 \n" - "vmovups 0x40(%0),%%zmm1 \n" - "add $0x80,%0 \n" - "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n" - "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n" - "vpmovuswb %%zmm0,%%ymm0 \n" - "vpmovuswb %%zmm1,%%ymm1 \n" - "vmovups %%ymm0,(%1) \n" - "vmovups %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovups (%0),%%zmm0 \n" + "vmovups 0x40(%0),%%zmm1 \n" + "add $0x80,%0 \n" + "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n" + "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n" + "vpmovuswb %%zmm0,%%ymm0 \n" + "vpmovuswb %%zmm1,%%ymm1 \n" + "vmovups %%ymm0,(%1) \n" + "vmovups %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -6470,7 +6333,7 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep movsb \n" + asm volatile("rep movsb \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -6680,7 +6543,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. 
- asm volatile("rep stosl \n" + asm volatile("rep stosl \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -6689,7 +6552,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosb \n" + asm volatile("rep stosb \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v8) // %2 @@ -6698,7 +6561,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosl \n" + asm volatile("rep stosl \n" : "+D"(dst_argb), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -7859,28 +7722,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -7896,27 +7759,27 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. 
LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -8743,20 +8606,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("movdqu (%3),%%xmm5 \n" + asm volatile("movdqu (%3),%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -8771,21 +8634,21 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("vbroadcastf128 (%3),%%ymm5 \n" + asm volatile("vbroadcastf128 (%3),%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -8800,24 +8663,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu 
%%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -8834,24 +8697,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -8868,27 +8731,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -8905,27 +8768,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 
$0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -8941,47 +8804,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile("pxor %%xmm3,%%xmm3 \n" + asm volatile("pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -9479,20 +9342,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. 
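// Aside (illustrative sketch, not part of this change): the pshufb with
// kShuffleUVToVU simply swaps the bytes of each UV pair; a plain-C
// equivalent (hypothetical name, width counted in UV pairs) would be:
static void SwapUVRow_C_sketch(const uint8_t* src_uv,
                               uint8_t* dst_vu,
                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_vu[2 * i + 0] = src_uv[2 * i + 1];  // V
    dst_vu[2 * i + 1] = src_uv[2 * i + 0];  // U
  }
}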
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("movdqu %3,%%xmm5 \n" + asm volatile("movdqu %3,%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -9503,21 +9366,21 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 diff --git a/source/row_neon.cc b/source/row_neon.cc index 0a1a83d1d..adb80374c 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -143,7 +143,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV444 + "1: \n" // + READYUV444 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -165,7 +166,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 + "1: \n" // + READYUV444 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -188,7 +190,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -211,7 +214,8 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 + "1: \n" // + READYUV444 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" @@ -236,7 +240,8 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" @@ -261,9 +266,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 - STORERGBA "bgt 1b \n" + STORERGBA "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -283,7 +289,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -313,7 +320,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. @@ -345,7 +353,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. @@ -379,7 +388,8 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "vmov.u8 d7, #0x0f \n" // vbic bits to clear - "1: \n" READYUV422 YUVTORGB RGBTORGB8 + "1: \n" // + READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTOARGB4444 "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels "bgt 1b \n" @@ -400,7 +410,8 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUV400 YUVTORGB RGBTORGB8 + "1: \n" // + READYUV400 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -437,7 +448,8 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 + "1: \n" // + READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -458,7 +470,8 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 + "1: \n" // + READNV21 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -479,7 +492,8 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 + "1: \n" // + READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -500,7 +514,8 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV21 YUVTORGB RGBTORGB8 + "1: \n" // + READNV21 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" @@ -521,7 +536,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READNV12 YUVTORGB RGBTORGB8 + "1: \n" // + READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" @@ -541,7 +557,8 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READYUY2 YUVTORGB RGBTORGB8 + "1: \n" // + READYUY2 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -560,7 +577,8 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" - "1: \n" READUYVY YUVTORGB RGBTORGB8 + "1: \n" // + READUYVY YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" @@ -1819,6 +1837,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +// Coefficients expressed as negatives to allow 128 struct RgbUVConstants { int8_t kRGBToU[4]; int8_t kRGBToV[4]; @@ -1832,18 +1851,14 @@ static void ARGBToUV444MatrixRow_NEON( int width, const struct RgbUVConstants* rgbuvconstants) { asm volatile( - "vld1.8 {d0}, [%4] \n" // load rgbuvconstants "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient - "vneg.s8 d25, d25 \n" - "vneg.s8 d26, d26 \n" - "vneg.s8 d27, d27 \n" - "vneg.s8 d28, d28 \n" - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vneg.s8 d24, d24 \n" + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1856,8 +1871,8 @@ static void ARGBToUV444MatrixRow_NEON( "vmlsl.u8 q3, d1, d28 \n" // G "vmlsl.u8 q3, d0, d27 \n" // B - "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned - "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned + "vaddhn.u16 d0, q2, q15 \n" // signed -> unsigned + "vaddhn.u16 d1, q3, q15 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. 
@@ -1871,7 +1886,7 @@ static void ARGBToUV444MatrixRow_NEON( "q15"); } -// RGB to bt601 coefficients +// RGB to BT601 coefficients // UB 0.875 coefficient = 112 // UG -0.5781 coefficient = -74 // UR -0.2969 coefficient = -38 @@ -1879,35 +1894,34 @@ static void ARGBToUV444MatrixRow_NEON( // VG -0.7344 coefficient = -94 // VR 0.875 coefficient = 112 -static const struct RgbUVConstants kRgb24I601UVConstants = {{112, -74, -38, 0}, - {-18, -94, 112, 0}}; +static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0}, + {18, 94, -112, 0}}; void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstants); + &kARGBI601UVConstants); } // RGB to JPEG coefficients -// UB 0.500 coefficient = 127 -// UG -0.33126 coefficient = -84 +// UB 0.500 coefficient = 128 +// UG -0.33126 coefficient = -85 // UR -0.16874 coefficient = -43 -// VB -0.08131 coefficient = -20 +// VB -0.08131 coefficient = -21 // VG -0.41869 coefficient = -107 -// VR 0.500 coefficient = 127 +// VR 0.500 coefficient = 128 -static const struct RgbUVConstants kRgb24JPEGUVConstants = { - {127, -84, -43, 0}, - {-20, -107, 127, 0}}; +static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0}, + {21, 107, -128, 0}}; void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24JPEGUVConstants); + &kARGBJPEGUVConstants); } // clang-format off @@ -1936,7 +1950,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1976,12 +1990,12 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q10, #128 \n" // UB/VR 0.500 coefficient + "vmov.s16 q11, #85 \n" // UG -0.33126 coefficient "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q13, #21 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -2021,12 +2035,12 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q10, #128 \n" // UB/VR 0.500 coefficient + "vmov.s16 q11, #85 \n" // UG -0.33126 coefficient "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q13, #21 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. 
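// Aside (illustrative, not part of this change): the updated JPEG constants
// appear to be the fractional coefficients scaled by 256 and rounded, which
// is what lets 0.5 map to exactly 128:
//   0.50000 * 256  = 128.0 -> 128  (was 127)
//   0.33126 * 256 ~=  84.8 ->  85  (was 84)
//   0.16874 * 256 ~=  43.2 ->  43
//   0.08131 * 256 ~=  20.8 ->  21  (was 20)
//   0.41869 * 256 ~= 107.2 -> 107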
@@ -2059,7 +2073,6 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, ); } -// TODO(fbarchard): Subsample match C code. void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -2067,12 +2080,12 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 - "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q10, #128 \n" // UB/VR 0.500 coefficient + "vmov.s16 q11, #85 \n" // UG -0.33126 coefficient "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q13, #21 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. @@ -2105,7 +2118,6 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, ); } -// TODO(fbarchard): Subsample match C code. void RAWToUVJRow_NEON(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, @@ -2113,12 +2125,12 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw - "vmov.s16 q10, #127 \n" // UB / VR 0.500 coefficient - "vmov.s16 q11, #84 \n" // UG -0.33126 coefficient + "vmov.s16 q10, #128 \n" // UB/VR 0.500 coefficient + "vmov.s16 q11, #85 \n" // UG -0.33126 coefficient "vmov.s16 q12, #43 \n" // UR -0.16874 coefficient - "vmov.s16 q13, #20 \n" // VB -0.08131 coefficient + "vmov.s16 q13, #21 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 \n" // VG -0.41869 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. @@ -2163,7 +2175,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. @@ -2208,7 +2220,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. @@ -2253,7 +2265,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. @@ -2298,7 +2310,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! 
\n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. @@ -2343,7 +2355,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. @@ -2389,7 +2401,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2454,7 +2466,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2519,7 +2531,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "vmov.s16 q12, #38 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 + "vmov.u16 q15, #0x8000 \n" // 128.0 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %4, %4, #16 \n" // 16 processed per loop. @@ -2748,10 +2760,11 @@ struct RgbConstants { // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; +// Add 0.5 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 0x0080}; -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 4b6947e18..fcada9d9e 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -242,7 +242,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV444 + "1: \n" // + READYUV444 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -264,7 +265,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV444 + "1: \n" // + READYUV444 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" @@ -289,11 +291,12 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV210 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" // + READYUV210 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ 
-317,11 +320,12 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV410 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" // + READYUV410 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -344,11 +348,12 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV212 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "1: \n" // + READYUV212 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -369,7 +374,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV210 + "1: \n" // + READYUV210 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -392,7 +398,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV410 + "1: \n" // + READYUV410 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -417,7 +424,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV212 + "1: \n" // + READYUV212 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -440,7 +448,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -521,12 +530,13 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP210 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" // + READYUVP210 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -547,12 +557,13 @@ void P410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP410 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" // + READYUVP410 + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + 
"b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -577,7 +588,8 @@ void I422ToAR30Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB STOREAR30 "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -704,7 +716,8 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "movi v15.8b, #255 \n" /* A */ - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n" "b.gt 1b \n" @@ -726,7 +739,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" @@ -767,7 +781,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8_TOP ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. @@ -838,7 +853,8 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 + "1: \n" // + READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8 "movi v19.8b, #255 \n" ARGBTOARGB4444 "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8 @@ -867,7 +883,8 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ "umull v4.8h, v1.8b, v28.8b \n" /* DB */ "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ - "1: \n" READYUV400 I400TORGB + "1: \n" // + READYUV400 I400TORGB "subs %w[width], %w[width], #8 \n" RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -928,8 +945,8 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 - "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 + "1: \n" // + READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -951,8 +968,8 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 - "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 + "1: \n" // + READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -973,8 +990,8 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 - "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 + "1: \n" // + READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -995,8 +1012,8 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 - "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 + "1: \n" // + READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n" "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -1017,7 +1034,8 @@ void 
NV12ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" - "1: \n" READNV12 + "1: \n" // + READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8_TOP ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 @@ -1042,8 +1060,8 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV21InterleavedTable]] \n" - "1: \n" READYUY2 - "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 + "1: \n" // + READYUY2 "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] @@ -1063,8 +1081,8 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12InterleavedTable]] \n" - "1: \n" READUYVY - "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 + "1: \n" // + READUYVY "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] @@ -2710,6 +2728,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } +// Coefficients expressed as negatives to allow 128 struct RgbUVConstants { int8_t kRGBToU[4]; int8_t kRGBToV[4]; @@ -2729,11 +2748,8 @@ static void ARGBToUV444MatrixRow_NEON( "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient - "neg v25.16b, v25.16b \n" - "neg v26.16b, v26.16b \n" - "neg v27.16b, v27.16b \n" - "neg v28.16b, v28.16b \n" - "movi v29.16b, #0x80 \n" // 128.5 + "neg v24.16b, v24.16b \n" + "movi v29.8h, #0x80, lsl #8 \n" // 128.0 "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB @@ -2747,8 +2763,8 @@ static void ARGBToUV444MatrixRow_NEON( "umlsl v3.8h, v1.8b, v28.8b \n" // G "umlsl v3.8h, v0.8b, v27.8b \n" // B - "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned - "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned + "addhn v0.8b, v4.8h, v29.8h \n" // signed -> unsigned + "addhn v1.8b, v3.8h, v29.8h \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. @@ -2768,8 +2784,9 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { - asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" - "movi v29.16b, #0x80 \n" // 128.5 + asm volatile( + "ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" + "movi v29.8h, #0x80, lsl #8 \n" // 128.0 "1: \n" "ldp q0, q1, [%[src]], #32 \n" "subs %w[width], %w[width], #8 \n" // 8 processed per loop. @@ -2784,8 +2801,8 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( "prfm pldl1keep, [%[src], 448] \n" "uzp1 v0.8h, v2.8h, v3.8h \n" "uzp1 v1.8h, v4.8h, v5.8h \n" - "addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned - "addhn v1.8b, v1.8h, v29.8h \n" // +128 -> unsigned + "subhn v0.8b, v29.8h, v0.8h \n" // -signed -> unsigned + "subhn v1.8b, v29.8h, v1.8h \n" "str d0, [%[dst_u]], #8 \n" // store 8 pixels U. "str d1, [%[dst_v]], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" @@ -2798,7 +2815,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( "v29"); } -// RGB to bt601 coefficients +// RGB to BT601 coefficients // UB 0.875 coefficient = 112 // UG -0.5781 coefficient = -74 // UR -0.2969 coefficient = -38 @@ -2806,15 +2823,15 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( // VG -0.7344 coefficient = -94 // VR 0.875 coefficient = 112 -static const struct RgbUVConstants kRgb24I601UVConstants = {{112, -74, -38, 0}, - {-18, -94, 112, 0}}; +static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0}, + {18, 94, -112, 0}}; void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstants); + &kARGBI601UVConstants); } void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, @@ -2822,27 +2839,26 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstants); + &kARGBI601UVConstants); } // RGB to JPEG coefficients -// UB 0.500 coefficient = 127 -// UG -0.33126 coefficient = -84 +// UB 0.500 coefficient = 128 +// UG -0.33126 coefficient = -85 // UR -0.16874 coefficient = -43 -// VB -0.08131 coefficient = -20 +// VB -0.08131 coefficient = -21 // VG -0.41869 coefficient = -107 -// VR 0.500 coefficient = 127 +// VR 0.500 coefficient = 128 -static const struct RgbUVConstants kRgb24JPEGUVConstants = { - {127, -84, -43, 0}, - {-20, -107, 127, 0}}; +static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0}, + {21, 107, -128, 0}}; void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24JPEGUVConstants); + &kARGBJPEGUVConstants); } void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, @@ -2850,16 +2866,16 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width, - &kRgb24JPEGUVConstants); + &kARGBJPEGUVConstants); } -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #112 \n" /* UB/VR coefficient (0.875) */ \ - "movi v21.8h, #74 \n" /* UG coefficient (-0.5781) */ \ - "movi v22.8h, #38 \n" /* UR coefficient (-0.2969) */ \ - "movi v23.8h, #18 \n" /* VB coefficient (-0.1406) */ \ - "movi v24.8h, #94 \n" /* VG coefficient (-0.7344) */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #112 \n" /* UB/VR coefficient (0.875) */ \ + "movi v21.8h, #74 \n" /* UG coefficient (-0.5781) */ \ + "movi v22.8h, #38 \n" /* UR coefficient (-0.2969) */ \ + "movi v23.8h, #18 \n" /* VB coefficient (-0.1406) */ \ + "movi v24.8h, #94 \n" /* VG coefficient (-0.7344) */ \ + "movi v25.8h, #0x80, lsl #8 \n" /* 128.0 (0x8000 in 16-bit) */ // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
// clang-format off @@ -2925,12 +2941,12 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( - "movi v20.8h, #127 \n" // UB/VR coeff (0.500) - "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v20.8h, #128 \n" // UB/VR coeff (0.500) + "movi v21.8h, #85 \n" // UG coeff (-0.33126) "movi v22.8h, #43 \n" // UR coeff (-0.16874) - "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v23.8h, #21 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. @@ -2970,12 +2986,12 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr, int width) { const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( - "movi v20.8h, #127 \n" // UB/VR coeff (0.500) - "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v20.8h, #128 \n" // UB/VR coeff (0.500) + "movi v21.8h, #85 \n" // UG coeff (-0.33126) "movi v22.8h, #43 \n" // UR coeff (-0.16874) - "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v23.8h, #21 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. @@ -3015,12 +3031,12 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int width) { const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( - "movi v20.8h, #127 \n" // UB/VR coeff (0.500) - "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v20.8h, #128 \n" // UB/VR coeff (0.500) + "movi v21.8h, #85 \n" // UG coeff (-0.33126) "movi v22.8h, #43 \n" // UR coeff (-0.16874) - "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v23.8h, #21 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. @@ -3060,12 +3076,12 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw, int width) { const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( - "movi v20.8h, #127 \n" // UB/VR coeff (0.500) - "movi v21.8h, #84 \n" // UG coeff (-0.33126) + "movi v20.8h, #128 \n" // UB/VR coeff (0.500) + "movi v21.8h, #85 \n" // UG coeff (-0.33126) "movi v22.8h, #43 \n" // UR coeff (-0.16874) - "movi v23.8h, #20 \n" // VB coeff (-0.08131) + "movi v23.8h, #21 \n" // VB coeff (-0.08131) "movi v24.8h, #107 \n" // VG coeff (-0.41869) - "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. 
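The updated UVJ values are the round-to-nearest 8-bit fixed-point coefficients: 0.500*256 = 128, 0.33126*256 ≈ 85, 0.16874*256 ≈ 43, 0.08131*256 ≈ 21 and 0.41869*256 ≈ 107, which is why 127, 84 and 20 become 128, 85 and 21, and UB = UG + UR and VR = VG + VB still hold exactly. A rough scalar equivalent of what these rows compute per subsampled pixel, written only to make the arithmetic visible and not taken from the library's C reference:

#include <stdint.h>

static uint8_t RGBToUJ_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  // 0.500*B - 0.33126*G - 0.16874*R + 128, in 8.8 fixed point.
  return (uint8_t)((128 * b - 85 * g - 43 * r + 0x8000) >> 8);
}

static uint8_t RGBToVJ_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  // 0.500*R - 0.41869*G - 0.08131*B + 128, in 8.8 fixed point.
  return (uint8_t)((128 * r - 107 * g - 21 * b + 0x8000) >> 8);
}

Because the coefficients sum to zero, the result stays in 0..255 and grey input lands exactly on 128 with the 0x8000 bias.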
@@ -3606,12 +3622,13 @@ static void ARGBToYMatrixRow_NEON_DotProd( // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 -// Add 0.5 = 0x80 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; +// Add 0.5 +static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + 0x0080}; static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, - 128}; + 0x0080}; -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; +static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 diff --git a/source/row_sve.cc b/source/row_sve.cc index 27bf87a6c..ba89b163a 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -203,6 +203,15 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y, // elements flipped to account for the interleaving nature of the widening // addition instructions. +// RGB to BT601 coefficients +// UB 0.875 coefficient = 112 +// UG -0.5781 coefficient = -74 +// UR -0.2969 coefficient = -38 +// VB -0.1406 coefficient = -18 +// VG -0.7344 coefficient = -94 +// VR 0.875 coefficient = 112 + +// SVE constants are not negated static const int16_t kARGBToUVCoefficients[] = { // UB, -UR, -UG, 0, -VB, VR, -VG, 0 112, -38, -74, 0, -18, 112, -94, 0, @@ -223,14 +232,22 @@ static const int16_t kABGRToUVCoefficients[] = { -38, 112, -74, 0, 112, -18, -94, 0, }; +// RGB to JPEG coefficients +// UB 0.500 coefficient = 128 +// UG -0.33126 coefficient = -85 +// UR -0.16874 coefficient = -43 +// VB -0.08131 coefficient = -21 +// VG -0.41869 coefficient = -107 +// VR 0.500 coefficient = 128 + static const int16_t kARGBToUVJCoefficients[] = { // UB, -UR, -UG, 0, -VB, VR, -VG, 0 - 127, -43, -84, 0, -20, 127, -107, 0, + 128, -43, -85, 0, -21, 128, -107, 0, }; static const int16_t kABGRToUVJCoefficients[] = { // -UR, UB, -UG, 0, VR, -VB, -VG, 0 - -43, 127, -84, 0, 127, -20, -107, 0, + -43, 128, -85, 0, 128, -21, -107, 0, }; static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, @@ -245,8 +262,7 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, "ptrue p0.b \n" "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" - "mov z26.b, #0x80 \n" - + "mov z26.h, #0x8000 \n" // 128.0 (0x8000) "cntb %[vl] \n" "subs %w[width], %w[width], %w[vl] \n" "b.lt 2f \n" diff --git a/source/row_win.cc b/source/row_win.cc index 5fb28521e..1a57ee4f5 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -12,7 +12,8 @@ // This module is for Visual C 32/64 bit #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) + (defined(_M_IX86) || defined(_M_X64)) && \ + (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) #if defined(_M_ARM64EC) #include @@ -182,15 +183,52 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, // 32 bit #else // defined(_M_X64) -#ifdef HAS_ARGBTOYROW_SSSE3 -// Constants for ARGB. -static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0}; +#ifdef HAS_ARGBTOUVROW_SSSE3 + +// 8 bit fixed point 0.5, for bias of UV. +static const ulvec8 kBiasUV128 = { + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +// NV21 shuf 8 VU to 16 UV. 
+static const lvec8 kShuffleNV21 = { + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, +}; + +// YUY2 shuf 16 Y to 32 Y. +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; + +// YUY2 shuf 8 UV to 16 UV. +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; + +// UYVY shuf 16 Y to 32 Y. +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; + +// UYVY shuf 8 UV to 16 UV. +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // JPeg full range. static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0}; +#endif + +// vpermd for vphaddw + vpackuswb vpermd. +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; + +// Constants for ARGB. +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0}; @@ -246,12 +284,6 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, // 7 bit fixed point 0.5. static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -// 8 bit fixed point 0.5, for bias of UV. -static const ulvec8 kBiasUV128 = { - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; - // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; @@ -287,32 +319,6 @@ static const uvec8 kShuffleMaskARGBToRAW = { static const uvec8 kShuffleMaskARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; -// YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, - 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, - 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; - -// YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, - 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, - 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; - -// UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, - 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, - 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; - -// UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, - 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, - 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; - -// NV21 shuf 8 VU to 16 UV. -static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, -}; - // Duplicates gray value 3 times and fills in alpha opaque. __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, @@ -1240,8 +1246,6 @@ __declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, } #ifdef HAS_ARGBTOYROW_AVX2 -// vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, @@ -1511,7 +1515,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width + // TODO: change biasuv to 0x8000 movdqa xmm5, xmmword ptr kBiasUV128 + // TODO: use negated coefficients to allow -128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v @@ -1552,10 +1558,12 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, pmaddubsw xmm3, xmm6 phaddw xmm0, xmm2 phaddw xmm1, xmm3 + // TODO: negate by subtracting from 0x8000 paddw xmm0, xmm5 // +.5 rounding -> unsigned paddw xmm1, xmm5 psraw xmm0, 8 psraw xmm1, 8 + // TODO: packuswb packsswb xmm0, xmm1 // step 3 - store 8 U and 8 V values @@ -1981,7 +1989,6 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, ret } } -#endif // HAS_ARGBTOYROW_SSSE3 // Read 16 UV from 444 #define READYUV444_AVX2 \ diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index bab60a213..84e765091 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -17,7 +17,9 @@ extern "C" { #endif // This module is for GCC x86 and x64. -#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + !defined(LIBYUV_ENABLE_ROWWIN) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, @@ -1761,25 +1763,25 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" // 16 pixel loop. 
LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -1792,23 +1794,23 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 diff --git a/source/scale_win.cc b/source/scale_win.cc index ea1f95c6c..32c0506fa 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -17,8 +17,8 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - !defined(__clang__) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_IX86) && \ + (!defined(__clang__) || defined(LIBYUV_ENABLE_ROWWIN)) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index eb0d4bbd9..82163cb77 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -2076,7 +2076,7 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { } uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381); - EXPECT_EQ(4157186353u, checksum); + EXPECT_EQ(223551344u, checksum); free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(dest_j420); @@ -2104,7 +2104,7 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) { } uint32_t checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381); - EXPECT_EQ(1526656597u, checksum); + EXPECT_EQ(4197774805u, checksum); free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(dest_i420);
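The two new expected values in convert_test.cc follow from the UV bias and coefficient updates in this change: the RGB24-to-J420 and RGB24-to-I420 outputs differ bit-for-bit, so the golden DJB2 hashes are regenerated. For orientation only, DJB2 with the 5381 seed used by these tests is hash = hash * 33 + byte; a minimal scalar sketch (the library also ships SIMD variants of HashDjb2, this is just the idea and not its exact implementation):

#include <stddef.h>
#include <stdint.h>

static uint32_t HashDjb2_Sketch(const uint8_t* src, size_t count,
                                uint32_t seed) {
  uint32_t hash = seed;  // the tests pass 5381
  for (size_t i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];
  }
  return hash;
}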