diff --git a/README.chromium b/README.chromium index e26ca693d..94ca08823 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1925 +Version: 1926 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 10e050f21..2816102f5 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -383,6 +383,9 @@ extern "C" { #define HAS_ARGBTOUV444MATRIXROW_AVX512BW #define HAS_ARGBTOYROW_AVX512BW #define HAS_ARGBTOUVJ444ROW_AVX512BW +#define HAS_ARGBTOUVROW_AVX512BW +#define HAS_ARGBTOUVJROW_AVX512BW +#define HAS_ARGBTOUVMATRIXROW_AVX512BW #endif // The following are available on Neon platforms: @@ -2157,6 +2160,12 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void ARGBToUV444MatrixRow_C(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2214,6 +2223,12 @@ void ARGBToUVMatrixRow_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void ARGBToUVMatrixRow_Any_AVX512BW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void ARGBToUV444MatrixRow_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2358,6 +2373,26 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_AVX512BW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_AVX512BW(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_AVX512BW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVJRow_AVX512BW(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2408,6 +2443,26 @@ void ABGRToUVJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUVRow_Any_AVX512BW(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_AVX512BW(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_AVX512BW(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVJRow_Any_AVX512BW(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index cb8f2b4de..d3e258095 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1925 +#define LIBYUV_VERSION 1926 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 25d934a32..07a58f602 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2107,6 +2107,14 @@ int ARGBToI420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { ARGBToYRow = ARGBToYRow_Any_LSX; @@ -2369,6 +2377,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { ARGBToYRow = ARGBToYRow_Any_LSX; @@ -3030,6 +3046,14 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #endif // HAS_RGB24TOYROW { @@ -3392,6 +3416,14 @@ int RAWToI420(const uint8_t* src_raw, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #endif // HAS_RAWTOYROW { @@ -4182,6 +4214,14 @@ int RGB565ToI420(const uint8_t* src_rgb565, ARGBToUVRow = ARGBToUVRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } #endif { #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_LSX) || \ @@ -4365,6 +4405,14 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, ARGBToUVRow = ARGBToUVRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } #endif { #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_LSX) || \ @@ -4543,6 +4591,14 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { ARGBToYRow = ARGBToYRow_Any_LSX; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index b2c0bd690..7f7be08ea 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -336,6 +336,14 @@ int ARGBToI422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -609,6 +617,14 @@ int ARGBToNV12(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { ARGBToYRow = ARGBToYRow_Any_LSX; @@ -907,6 +923,14 @@ int ARGBToNV21(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -1131,6 +1155,14 @@ int ABGRToNV12(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ABGRToUVRow = ABGRToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; @@ -1336,6 +1368,14 @@ int ABGRToNV21(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ABGRToUVRow = ABGRToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; @@ -1554,6 +1594,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -1770,6 +1818,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVRow = ARGBToUVRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -2938,6 +2994,14 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVJRow = ARGBToUVJRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { ARGBToYJRow = ARGBToYJRow_Any_LSX; @@ -3046,6 +3110,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVJRow = ARGBToUVJRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; @@ -3364,6 +3436,14 @@ int ABGRToJ420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVJROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ABGRToUVJRow = ABGRToUVJRow_AVX512BW; + } + } +#endif #if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYJRow = ABGRToYJRow_Any_NEON; @@ -3516,6 +3596,14 @@ int ABGRToJ422(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOUVJROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ABGRToUVJRow = ABGRToUVJRow_AVX512BW; + } + } +#endif #if defined(HAS_ABGRTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYJRow = ABGRToYJRow_Any_NEON; @@ -3942,6 +4030,14 @@ int RAWToJNV21(const uint8_t* src_raw, } } #endif +#if defined(HAS_ARGBTOUVJROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVJRow = ARGBToUVJRow_AVX512BW; + } + } +#endif #endif // HAS_RAWTOYJROW #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { diff --git a/source/row_any.cc b/source/row_any.cc index a4ba290dc..8ac48d3c0 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2208,8 +2208,8 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) #define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ - SIMD_ALIGNED(uint8_t vin[128 * 2]); \ - SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + SIMD_ALIGNED(uint8_t vin[256 * 2]); \ + SIMD_ALIGNED(uint8_t vout[256 * 2]); \ memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -2218,17 +2218,17 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) } \ ptrdiff_t np = n; \ memcpy(vin, src_ptr + (np >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(vin + 128, src_ptr + src_stride + (np >> UVSHIFT) * BPP, \ + memcpy(vin + 256, src_ptr + src_stride + (np >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ - vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 256 + SS(r, UVSHIFT) * BPP, \ + vin + 256 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1); \ + ANY_SIMD(vin, 256, vout, vout + 256, MASK + 1); \ memcpy(dst_u + (np >> 1), vout, SS(r, 1)); \ - memcpy(dst_v + (np >> 1), vout + 128, SS(r, 1)); \ + memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \ } #define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK) \ @@ -2251,8 +2251,8 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) #define ANY12MS(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width, const struct ArgbConstants* c) { \ - SIMD_ALIGNED(uint8_t vin[128 * 2]); \ - SIMD_ALIGNED(uint8_t vout[128 * 2]); \ + SIMD_ALIGNED(uint8_t vin[256 * 2]); \ + SIMD_ALIGNED(uint8_t vout[256 * 2]); \ memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -2261,22 +2261,25 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) } \ ptrdiff_t np = n; \ memcpy(vin, src_ptr + (np >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(vin + 128, src_ptr + src_stride + (np >> UVSHIFT) * BPP, \ + memcpy(vin + 256, src_ptr + src_stride + (np >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(vin + SS(r, UVSHIFT) * BPP, vin + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ - memcpy(vin + 128 + SS(r, UVSHIFT) * BPP, \ - vin + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + memcpy(vin + 256 + SS(r, UVSHIFT) * BPP, \ + vin + 256 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ - ANY_SIMD(vin, 128, vout, vout + 128, MASK + 1, c); \ + ANY_SIMD(vin, 256, vout, vout + 256, MASK + 1, c); \ memcpy(dst_u + (np >> 1), vout, SS(r, 1)); \ - memcpy(dst_v + (np >> 1), vout + 128, SS(r, 1)); \ + memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \ } #ifdef HAS_ARGBTOUVMATRIXROW_AVX2 ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW +ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63) +#endif #ifdef HAS_ARGBTOUVMATRIXROW_SSSE3 ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7) #endif @@ -2326,15 +2329,27 @@ ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15) #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVROW_AVX512BW +ANY12S(ARGBToUVRow_Any_AVX512BW, ARGBToUVRow_AVX512BW, 0, 4, 63) +#endif #ifdef HAS_ABGRTOUVROW_AVX2 ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVROW_AVX512BW +ANY12S(ABGRToUVRow_Any_AVX512BW, ABGRToUVRow_AVX512BW, 0, 4, 63) +#endif #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ARGBTOUVJROW_AVX512BW +ANY12S(ARGBToUVJRow_Any_AVX512BW, ARGBToUVJRow_AVX512BW, 0, 4, 63) +#endif #ifdef HAS_ABGRTOUVJROW_AVX2 ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVJROW_AVX512BW +ANY12S(ABGRToUVJRow_Any_AVX512BW, ABGRToUVJRow_AVX512BW, 0, 4, 63) +#endif #ifdef HAS_ARGBTOUVJROW_SSSE3 ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index ba74ef87e..9ed7fce9c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1493,11 +1493,16 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, } #endif -#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) +#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW) static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; #endif +#if defined(HAS_ARGBTOUVROW_AVX512BW) || defined(HAS_ARGBTOUVJROW_AVX512BW) +static const uint32_t kPermdARGBToUV_AVX512BW[16] = { + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; +#endif + #ifdef HAS_ARGBTOYROW_AVX512BW void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, @@ -2157,6 +2162,129 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, } #endif // HAS_ABGRTOUVJROW_AVX2 +#ifdef HAS_ARGBTOUVROW_AVX512BW + +// 32x2 -> 16x1 ARGB pixels converted to 16 U and 16 V +// ARGBToUV does rounding average of 4 ARGB pixels + +void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c) { + asm volatile( + "vbroadcasti64x4 0x20(%5),%%zmm4 \n" // RGBToU + "vbroadcasti64x4 0x40(%5),%%zmm5 \n" // RGBToV + "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" + "vpabsb %%zmm16,%%zmm6 \n" // 0x0101 + "vpsllw $15,%%zmm16,%%zmm17 \n" // 0x8000 + "vbroadcasti64x4 %6,%%zmm7 \n" // kShuffleAARRGGBB + "vmovups %7,%%zmm18 \n" // kPermdARGBToY_AVX512BW + "vmovups %8,%%zmm19 \n" // kPermdARGBToUV_AVX512BW + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovups (%0),%%zmm0 \n" // Read 32x2 ARGB Pixels + "vmovups 0x40(%0),%%zmm1 \n" + "vmovups 0x00(%0,%4,1),%%zmm2 \n" + "vmovups 0x40(%0,%4,1),%%zmm3 \n" + "vpshufb %%zmm7,%%zmm0,%%zmm0 \n" // aarrggbb + "vpshufb %%zmm7,%%zmm1,%%zmm1 \n" + "vpshufb %%zmm7,%%zmm2,%%zmm2 \n" + "vpshufb %%zmm7,%%zmm3,%%zmm3 \n" + "vpmaddubsw %%zmm6,%%zmm0,%%zmm0 \n" // 32x2 -> 16x2 + "vpmaddubsw %%zmm6,%%zmm1,%%zmm1 \n" + "vpmaddubsw %%zmm6,%%zmm2,%%zmm2 \n" + "vpmaddubsw %%zmm6,%%zmm3,%%zmm3 \n" + "vpaddw %%zmm0,%%zmm2,%%zmm0 \n" // 16x2 -> 16x1 + "vpaddw %%zmm1,%%zmm3,%%zmm1 \n" + "vpxorq %%zmm2,%%zmm2,%%zmm2 \n" // 0 for vpavgw + "vpsrlw $1,%%zmm0,%%zmm0 \n" + "vpsrlw $1,%%zmm1,%%zmm1 \n" + "vpavgw %%zmm2,%%zmm0,%%zmm0 \n" + "vpavgw %%zmm2,%%zmm1,%%zmm1 \n" + "vpackuswb %%zmm1,%%zmm0,%%zmm0 \n" // mutates + "vpermd %%zmm0,%%zmm19,%%zmm0 \n" // unscramble pixels + + "vpmaddubsw %%zmm4,%%zmm0,%%zmm1 \n" // 16 U + "vpmaddubsw %%zmm5,%%zmm0,%%zmm0 \n" // 16 V + "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" + "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" + "vpackssdw %%zmm0,%%zmm1,%%zmm0 \n" // mutates (U in lower, V in upper) + "vpaddw %%zmm17,%%zmm0,%%zmm0 \n" + "vpsrlw $0x8,%%zmm0,%%zmm0 \n" + "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" // mutates + "vpermd %%zmm0,%%zmm18,%%zmm0 \n" // unmutate + + "vmovdqu %%xmm0,(%1) \n" // Write 16 U's + "vextracti32x4 $0x1,%%zmm0,%%xmm0 \n" + "vmovdqu %%xmm0,0x00(%1,%2,1) \n" // Write 16 V's + + "lea 0x80(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "subl $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 +#if defined(__i386__) + "+m"(width) // %3 +#else + "+rm"(width) // %3 +#endif + : "r"((intptr_t)(src_stride_argb)), // %4 + "r"(c), // %5 + "m"(kShuffleAARRGGBB), // %6 + "m"(kPermdARGBToY_AVX512BW), // %7 + "m"(kPermdARGBToUV_AVX512BW) // %8 + : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", + "zmm7", "zmm16", "zmm17", "zmm18", "zmm19"); +} + +void ARGBToUVRow_AVX512BW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX512BW(src_argb, src_stride_argb, dst_u, dst_v, width, + &kArgbI601Constants); +} + +void ABGRToUVRow_AVX512BW(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX512BW(src_abgr, src_stride_abgr, dst_u, dst_v, width, + &kAbgrI601Constants); +} + +#ifdef HAS_ARGBTOUVJROW_AVX512BW +void ARGBToUVJRow_AVX512BW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX512BW(src_argb, src_stride_argb, dst_u, dst_v, width, + &kArgbJPEGConstants); +} +#endif // HAS_ARGBTOUVJROW_AVX512BW + +#ifdef HAS_ABGRTOUVJROW_AVX512BW +void ABGRToUVJRow_AVX512BW(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUVMatrixRow_AVX512BW(src_abgr, src_stride_abgr, dst_u, dst_v, width, + &kAbgrJPEGConstants); +} +#endif // HAS_ABGRTOUVJROW_AVX512BW +#endif // HAS_ARGBTOUVROW_AVX512BW + void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { ARGBToYMatrixRow_SSSE3(src_bgra, dst_y, width, &kBgraI601Constants); }