diff --git a/README.chromium b/README.chromium
index ed99899b2..4aa32334e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1931
+Version: 1932
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5ce895075..2c2fa55bf 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -351,7 +351,9 @@ extern "C" {
     ((defined(_MSC_VER) && !defined(__clang__)) || \
      defined(LIBYUV_ENABLE_ROWWIN))
 #define HAS_RAWTOARGBROW_AVX2
-#define HAS_RAWTOARGBROW_AVX512BW
+#if defined(__x86_64__) || defined(_M_X64)
+#define HAS_RAWTOARGBROW_AVX512VBMI
+#endif
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOYMATRIXROW_AVX2
 #define HAS_ABGRTOYROW_AVX2
@@ -369,7 +371,9 @@ extern "C" {
     (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) && \
     !defined(LIBYUV_ENABLE_ROWWIN)
 #define HAS_COPYROW_AVX512BW
-#define HAS_RAWTOARGBROW_AVX512BW
+#if defined(__x86_64__) || defined(_M_X64)
+#define HAS_RAWTOARGBROW_AVX512VBMI
+#endif
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_CONVERT16TO8ROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
@@ -3976,7 +3980,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
                           int width);
 void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
 void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
 void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
 void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@@ -4068,7 +4072,7 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
 void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
-void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_AVX512VBMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h
index e47b9fe5e..f7e2123a7 100644
--- a/include/libyuv/row_sve.h
+++ b/include/libyuv/row_sve.h
@@ -2019,7 +2019,7 @@ static const int8_t kABGRToUVJCoefficients[] = {
     43, 85, -128, 0, -128, 107, 21, 0,
 };
 
-#define ABCDTOUVMATRIX_SVE \
+#define ARGBTOUVMATRIX_SVE \
   "ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
   "ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
   "ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
@@ -2113,7 +2113,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb,
       "ptrue p4.d \n"
       "ptrue p5.h \n"
       "1: \n"  //
-      ABCDTOUVMATRIX_SVE
+      ARGBTOUVMATRIX_SVE
       "b.gt 1b \n"
 
       "2: \n"
@@ -2126,7 +2126,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb,
       "whilelt p3.d, %w[vl2], %w[width] \n"
      "whilelt p4.d, %w[vl3], %w[width] \n"
      "whilelt p5.h, wzr, %w[width] \n"  //
-      ABCDTOUVMATRIX_SVE
+      ARGBTOUVMATRIX_SVE
      "b.gt 3b \n"
 
      "99: \n"
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2719ee772..27cb40597 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1931
+#define LIBYUV_VERSION 1932
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 40d6a9fcb..562f15b04 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -3413,6 +3413,56 @@ int RAWToI420(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToARGBRow = RAWToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
+    if (IS_ALIGNED(width, 64)) {
+      RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RAWToARGBRow = RAWToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RAWToARGBRow = RAWToARGBRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToARGBRow = RAWToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RAWToARGBRow = RAWToARGBRow_RVV;
+  }
+#endif
 #if defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -3568,14 +3618,48 @@ int RAWToJ420(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_RAWTOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
     if (IS_ALIGNED(width, 64)) {
-      RAWToARGBRow = RAWToARGBRow_AVX512BW;
+      RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    RAWToARGBRow = RAWToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToARGBRow = RAWToARGBRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    RAWToARGBRow = RAWToARGBRow_Any_LSX;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    RAWToARGBRow = RAWToARGBRow_Any_LASX;
+    if (IS_ALIGNED(width, 32)) {
+      RAWToARGBRow = RAWToARGBRow_LASX;
+    }
+  }
+#endif
+#if defined(HAS_RAWTOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    RAWToARGBRow = RAWToARGBRow_RVV;
+  }
+#endif
 #if defined(HAS_ARGBTOYJROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@@ -3818,11 +3902,11 @@ int RAWToI444(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_RAWTOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
     if (IS_ALIGNED(width, 64)) {
-      RAWToARGBRow = RAWToARGBRow_AVX512BW;
+      RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
     }
   }
 #endif
@@ -4037,11 +4121,11 @@ int RAWToJ444(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_RAWTOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
     if (IS_ALIGNED(width, 64)) {
-      RAWToARGBRow = RAWToARGBRow_AVX512BW;
+      RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
     }
   }
 #endif
@@ -4955,11 +5039,11 @@ int RAWToJ400(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_RAWTOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
     if (IS_ALIGNED(width, 64)) {
-      RAWToARGBRow = RAWToARGBRow_AVX512BW;
+      RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
     }
   }
 #endif
@@ -4971,6 +5055,11 @@
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTOARGBROW_LSX)
   if (TestCpuFlag(kCpuHasLSX)) {
     RAWToARGBRow = RAWToARGBRow_Any_LSX;
@@ -4993,7 +5082,6 @@
   }
 #endif
 
-
   {
     // Allocate 1 row of ARGB.
     const int row_size = (width * 4 + 31) & ~31;
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 615ba7741..af26a6e9c 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -3723,11 +3723,11 @@ int RAWToARGB(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_RAWTOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
     if (IS_ALIGNED(width, 64)) {
-      RAWToARGBRow = RAWToARGBRow_AVX512BW;
+      RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
     }
   }
 #endif
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 3f67da8b4..916016733 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -4184,11 +4184,11 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_RAWTOARGBROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
     if (IS_ALIGNED(width, 64)) {
-      RAWToARGBRow = RAWToARGBRow_AVX512BW;
+      RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
     }
   }
 #endif
@@ -4200,6 +4200,11 @@
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    RAWToARGBRow = RAWToARGBRow_SVE2;
+  }
+#endif
 #if defined(HAS_RAWTOARGBROW_LSX)
   if (TestCpuFlag(kCpuHasLSX)) {
     RAWToARGBRow = RAWToARGBRow_Any_LSX;
@@ -4221,8 +4226,6 @@
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
-
-
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 04e7c4639..0ddf867b8 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1000,8 +1000,8 @@ ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
 #if defined(HAS_RAWTOARGBROW_AVX2)
 ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31)
 #endif
-#if defined(HAS_RAWTOARGBROW_AVX512BW)
-ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63)
+#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
+ANY11(RAWToARGBRow_Any_AVX512VBMI, RAWToARGBRow_AVX512VBMI, 0, 3, 4, 63)
 #endif
 #if defined(HAS_RAWTORGBAROW_SSSE3)
 ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 33066148a..89127e4d5 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -262,42 +262,33 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-#ifdef HAS_COPYROW_AVX512BW
+#ifdef HAS_RAWTOARGBROW_AVX512VBMI
+static const uint8_t kPermRAWToARGB_AVX512VBMI[64] = {
+    2,  1,  0,  48, 5,  4,  3,  48, 8,  7,  6,  48, 11, 10, 9,  48,
+    14, 13, 12, 48, 17, 16, 15, 48, 20, 19, 18, 48, 23, 22, 21, 48,
+    26, 25, 24, 48, 29, 28, 27, 48, 32, 31, 30, 48, 35, 34, 33, 48,
+    38, 37, 36, 48, 41, 40, 39, 48, 44, 43, 42, 48, 47, 46, 45, 48};
+
 // TODO(fbarchard): optimize this with a mask or vpermb
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   asm volatile(
       "vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n"  // 0xffffffff
       "vpslld $0x18,%%zmm6,%%zmm6 \n"  // 0xff000000
-      "vbroadcasti32x4 %3,%%zmm4 \n"  //
-      "vbroadcasti32x4 %4,%%zmm5 \n"  //
+      "movabs $0xffffffffffff,%%rax \n"  // 48 bytes mask
+      "kmovq %%rax,%%k1 \n"
+      "vmovdqu8 %3,%%zmm5 \n"
 
      LABELALIGN  //
      "1: \n"
-      "vmovdqu (%0),%%xmm0 \n"
-      "vinserti32x4 $1,12(%0),%%zmm0,%%zmm0 \n"
-      "vinserti32x4 $2,24(%0),%%zmm0,%%zmm0 \n"
-      "vinserti32x4 $3,36(%0),%%zmm0,%%zmm0 \n"
-
-      "vmovdqu 48(%0),%%xmm1 \n"
-      "vinserti32x4 $1,60(%0),%%zmm1,%%zmm1 \n"
-      "vinserti32x4 $2,72(%0),%%zmm1,%%zmm1 \n"
-      "vinserti32x4 $3,84(%0),%%zmm1,%%zmm1 \n"
-
-      "vmovdqu 96(%0),%%xmm2 \n"
-      "vinserti32x4 $1,108(%0),%%zmm2,%%zmm2 \n"
-      "vinserti32x4 $2,120(%0),%%zmm2,%%zmm2 \n"
-      "vinserti32x4 $3,132(%0),%%zmm2,%%zmm2 \n"
-
-      "vmovdqu 140(%0),%%xmm3 \n"
-      "vinserti32x4 $1,152(%0),%%zmm3,%%zmm3 \n"
-      "vinserti32x4 $2,164(%0),%%zmm3,%%zmm3 \n"
-      "vinserti32x4 $3,176(%0),%%zmm3,%%zmm3 \n"
-
+      "vmovdqu8 (%0),%%zmm0%{%%k1%}%{z%} \n"
+      "vmovdqu8 48(%0),%%zmm1%{%%k1%}%{z%} \n"
+      "vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n"
+      "vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n"
       "lea 192(%0),%0 \n"
-      "vpshufb %%zmm4,%%zmm0,%%zmm0 \n"
-      "vpshufb %%zmm4,%%zmm1,%%zmm1 \n"
-      "vpshufb %%zmm4,%%zmm2,%%zmm2 \n"
-      "vpshufb %%zmm5,%%zmm3,%%zmm3 \n"
+      "vpermb %%zmm0,%%zmm5,%%zmm0 \n"
+      "vpermb %%zmm1,%%zmm5,%%zmm1 \n"
+      "vpermb %%zmm2,%%zmm5,%%zmm2 \n"
+      "vpermb %%zmm3,%%zmm5,%%zmm3 \n"
       "vpord %%zmm6,%%zmm0,%%zmm0 \n"
       "vpord %%zmm6,%%zmm1,%%zmm1 \n"
       "vpord %%zmm6,%%zmm2,%%zmm2 \n"
@@ -313,9 +304,8 @@ void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width)
       : "+r"(src_raw),   // %0
         "+r"(dst_argb),  // %1
        "+r"(width)      // %2
-      : "m"(kShuffleMaskRAWToARGB),   // %3
-        "m"(kShuffleMaskRAWToARGB_0)  // %4
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "m"(kPermRAWToARGB_AVX512VBMI)  // %3
+      : "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm5", "zmm6");
 }
 #endif
 
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 3e5ce489d..09bad8df9 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3449,7 +3449,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 
 // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
-static void ABCDToUVMatrixRow_NEON_I8MM(const uint8_t* src,
+static void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src,
                                         int src_stride,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
@@ -3551,7 +3551,7 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
                               kARGBToUVCoefficients);
 }
 
@@ -3560,7 +3560,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                               kABGRToUVCoefficients);
 }
 
@@ -3569,7 +3569,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ABCDToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
                               kBGRAToUVCoefficients);
 }
 
@@ -3578,7 +3578,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ABCDToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
                               kRGBAToUVCoefficients);
 }
 
@@ -3606,7 +3606,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
                               kARGBToUVJCoefficients);
 }
 
@@ -3615,7 +3615,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                               kABGRToUVJCoefficients);
 }
 
diff --git a/source/row_win.cc b/source/row_win.cc
index 1defcb467..e9080d19d 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -261,40 +261,26 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 }
 #endif
 
-#ifdef HAS_RAWTOARGBROW_AVX512BW
+#ifdef HAS_RAWTOARGBROW_AVX512VBMI
 LIBYUV_TARGET_AVX512BW
-void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   __m512i zmm_alpha = _mm512_set1_epi32(0xff000000);
-  __m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
-  __m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
-  __m512i zmm_shuf = _mm512_broadcast_i32x4(shuf_low);
-  __m512i zmm_shuf2 = _mm512_broadcast_i32x4(shuf_high);
+  __m512i zmm_shuf = _mm512_set_epi8(
+      48, 45, 46, 47, 48, 42, 43, 44, 48, 39, 40, 41, 48, 36, 37, 38,
+      48, 33, 34, 35, 48, 30, 31, 32, 48, 27, 28, 29, 48, 24, 25, 26,
+      48, 21, 22, 23, 48, 18, 19, 20, 48, 15, 16, 17, 48, 12, 13, 14,
+      48, 9, 10, 11, 48, 6, 7, 8, 48, 3, 4, 5, 48, 0, 1, 2);
   while (width > 0) {
-    __m512i zmm0 = _mm512_castsi128_si512(_mm_loadu_si128((const __m128i*)src_raw));
-    zmm0 = _mm512_inserti32x4(zmm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
-    zmm0 = _mm512_inserti32x4(zmm0, _mm_loadu_si128((const __m128i*)(src_raw + 24)), 2);
-    zmm0 = _mm512_inserti32x4(zmm0, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 3);
+    __m512i zmm0 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw);
+    __m512i zmm1 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 48);
+    __m512i zmm2 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 96);
+    __m512i zmm3 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 144);
 
-    __m512i zmm1 = _mm512_castsi128_si512(_mm_loadu_si128((const __m128i*)(src_raw + 48)));
-    zmm1 = _mm512_inserti32x4(zmm1, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
-    zmm1 = _mm512_inserti32x4(zmm1, _mm_loadu_si128((const __m128i*)(src_raw + 72)), 2);
-    zmm1 = _mm512_inserti32x4(zmm1, _mm_loadu_si128((const __m128i*)(src_raw + 84)), 3);
-
-    __m512i zmm2 = _mm512_castsi128_si512(_mm_loadu_si128((const __m128i*)(src_raw + 96)));
-    zmm2 = _mm512_inserti32x4(zmm2, _mm_loadu_si128((const __m128i*)(src_raw + 108)), 1);
-    zmm2 = _mm512_inserti32x4(zmm2, _mm_loadu_si128((const __m128i*)(src_raw + 120)), 2);
-    zmm2 = _mm512_inserti32x4(zmm2, _mm_loadu_si128((const __m128i*)(src_raw + 132)), 3);
-
-    __m512i zmm3 = _mm512_castsi128_si512(_mm_loadu_si128((const __m128i*)(src_raw + 140)));
-    zmm3 = _mm512_inserti32x4(zmm3, _mm_loadu_si128((const __m128i*)(src_raw + 152)), 1);
-    zmm3 = _mm512_inserti32x4(zmm3, _mm_loadu_si128((const __m128i*)(src_raw + 164)), 2);
-    zmm3 = _mm512_inserti32x4(zmm3, _mm_loadu_si128((const __m128i*)(src_raw + 176)), 3);
-
-    zmm0 = _mm512_shuffle_epi8(zmm0, zmm_shuf);
-    zmm1 = _mm512_shuffle_epi8(zmm1, zmm_shuf);
-    zmm2 = _mm512_shuffle_epi8(zmm2, zmm_shuf);
-    zmm3 = _mm512_shuffle_epi8(zmm3, zmm_shuf2);
+    zmm0 = _mm512_permutexvar_epi8(zmm_shuf, zmm0);
+    zmm1 = _mm512_permutexvar_epi8(zmm_shuf, zmm1);
+    zmm2 = _mm512_permutexvar_epi8(zmm_shuf, zmm2);
+    zmm3 = _mm512_permutexvar_epi8(zmm_shuf, zmm3);
 
     zmm0 = _mm512_or_si512(zmm0, zmm_alpha);
     zmm1 = _mm512_or_si512(zmm1, zmm_alpha);
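The new kernels in row_gcc.cc and row_win.cc share one idea: instead of assembling each 64-byte vector from four overlapping 16-byte loads and shuffling with two vpshufb tables, they load 48 RAW bytes through a 48-bit k-mask (which zeroes vector bytes 48..63) and expand them with a single vpermb. Permute index 48 therefore always fetches a zeroed byte into each alpha lane, and the trailing OR with 0xff000000 sets alpha to 255. Below is a self-contained sketch of that 16-pixel step, checked against a scalar reference; RawToArgb16 and the test harness are illustrative, not libyuv APIs. Assumes an AVX512-VBMI capable CPU; build with: cc -O2 -mavx512vbmi -mavx512bw demo.c

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same permutation table as kPermRAWToARGB_AVX512VBMI above: result byte i
   comes from source byte kPerm[i]; index 48 reads a byte the masked load
   zeroed, reserving the alpha lanes. */
static const uint8_t kPerm[64] = {
    2,  1,  0,  48, 5,  4,  3,  48, 8,  7,  6,  48, 11, 10, 9,  48,
    14, 13, 12, 48, 17, 16, 15, 48, 20, 19, 18, 48, 23, 22, 21, 48,
    26, 25, 24, 48, 29, 28, 27, 48, 32, 31, 30, 48, 35, 34, 33, 48,
    38, 37, 36, 48, 41, 40, 39, 48, 44, 43, 42, 48, 47, 46, 45, 48};

/* 16 RAW pixels (48 bytes, memory order R,G,B) -> 16 ARGB pixels
   (64 bytes, memory order B,G,R,A). */
static void RawToArgb16(const uint8_t* src, uint8_t* dst) {
  const __m512i perm = _mm512_loadu_si512((const void*)kPerm);
  const __m512i alpha = _mm512_set1_epi32((int)0xff000000);
  /* 48-bit k-mask: vector bytes 48..63 are zeroed, not read from memory. */
  __m512i v = _mm512_maskz_loadu_epi8(0xffffffffffffull, src);
  v = _mm512_permutexvar_epi8(perm, v); /* scatter B,G,R; alpha lanes = 0 */
  v = _mm512_or_si512(v, alpha);        /* force A = 0xff */
  _mm512_storeu_si512((void*)dst, v);
}

int main(void) {
  uint8_t raw[48], argb[64], ref[64];
  for (int i = 0; i < 48; ++i) raw[i] = (uint8_t)(i * 7 + 1);
  for (int i = 0; i < 16; ++i) { /* scalar reference */
    ref[4 * i + 0] = raw[3 * i + 2]; /* B */
    ref[4 * i + 1] = raw[3 * i + 1]; /* G */
    ref[4 * i + 2] = raw[3 * i + 0]; /* R */
    ref[4 * i + 3] = 0xff;           /* A */
  }
  RawToArgb16(raw, argb);
  printf("%s\n", memcmp(argb, ref, 64) == 0 ? "match" : "MISMATCH");
  return 0;
}

The production loops apply this step to four vectors per iteration (192 RAW bytes in, 256 ARGB bytes out, 64 pixels), which is why the dispatch code only selects the non-Any RAWToARGBRow_AVX512VBMI when IS_ALIGNED(width, 64) and routes the remainder through RAWToARGBRow_Any_AVX512VBMI.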