From 4afb9654162e142b52d349471823815f4c60bc3d Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 22 Apr 2026 15:20:00 -0700 Subject: [PATCH] RAWToARGB use AVX512BW Bug: libyuv:42280902 Change-Id: I7a80fd64d97b6d411316819df0fd917d609a173b Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7787163 Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- include/libyuv/row.h | 12 ++- source/convert.cc | 159 ++++++++++++++++++++++++++++++------ source/convert_argb.cc | 27 ++++-- source/convert_from_argb.cc | 8 +- source/row_any.cc | 7 +- source/row_gcc.cc | 46 +++++++---- source/row_win.cc | 37 ++++++--- 7 files changed, 227 insertions(+), 69 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 2c2fa55bf..3b2c52aaa 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -352,7 +352,8 @@ extern "C" { defined(LIBYUV_ENABLE_ROWWIN)) #define HAS_RAWTOARGBROW_AVX2 #if defined(__x86_64__) || defined(_M_X64) -#define HAS_RAWTOARGBROW_AVX512VBMI +#define HAS_RAWTOARGBROW_AVX512BW +#define HAS_RGB24TOARGBROW_AVX512BW #endif #define HAS_ARGBTOYROW_AVX2 #define HAS_ARGBTOYMATRIXROW_AVX2 @@ -372,7 +373,8 @@ extern "C" { !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_COPYROW_AVX512BW #if defined(__x86_64__) || defined(_M_X64) -#define HAS_RAWTOARGBROW_AVX512VBMI +#define HAS_RAWTOARGBROW_AVX512BW +#define HAS_RGB24TOARGBROW_AVX512BW #endif #define HAS_ARGBTORGB24ROW_AVX512VBMI #define HAS_CONVERT16TO8ROW_AVX512BW @@ -3980,7 +3982,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); @@ -4072,7 +4074,9 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_AVX512VBMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/source/convert.cc b/source/convert.cc index 562f15b04..a1a7ba9bf 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -3077,6 +3077,56 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToARGBRow = RGB24ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + RGB24ToARGBRow = RGB24ToARGBRow_SVE2; + } +#endif +#if defined(HAS_RGB24TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToARGBRow = RGB24ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToARGBRow = RGB24ToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -3220,6 +3270,56 @@ int RGB24ToJ420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToARGBRow = RGB24ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToARGBRow = RGB24ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + RGB24ToARGBRow = RGB24ToARGBRow_SVE2; + } +#endif +#if defined(HAS_RGB24TOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RGB24ToARGBRow = RGB24ToARGBRow_LASX; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToARGBRow = RGB24ToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -3421,11 +3521,11 @@ int RAWToI420(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI; +#if defined(HAS_RAWTOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512VBMI; + RAWToARGBRow = RAWToARGBRow_AVX512BW; } } #endif @@ -3618,11 +3718,11 @@ int RAWToJ420(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI; +#if defined(HAS_RAWTOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512VBMI; + RAWToARGBRow = RAWToARGBRow_AVX512BW; } } #endif @@ -3902,11 +4002,11 @@ int RAWToI444(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI; +#if defined(HAS_RAWTOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512VBMI; + RAWToARGBRow = RAWToARGBRow_AVX512BW; } } #endif @@ -4121,11 +4221,11 @@ int RAWToJ444(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI; +#if defined(HAS_RAWTOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512VBMI; + RAWToARGBRow = RAWToARGBRow_AVX512BW; } } #endif @@ -4878,11 +4978,19 @@ int RGB24ToJ400(const uint8_t* src_rgb24, #if defined(HAS_RGB24TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { RGB24ToARGBRow = RGB24ToARGBRow_AVX2; } } #endif +#if defined(HAS_RGB24TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; @@ -4891,6 +4999,11 @@ int RGB24ToJ400(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + RGB24ToARGBRow = RGB24ToARGBRow_SVE2; + } +#endif #if defined(HAS_RGB24TOARGBROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX; @@ -4912,9 +5025,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif - - - { +{ // Allocate 1 row of ARGB. const int row_size = (width * 4 + 31) & ~31; align_buffer_64(row, row_size); @@ -5039,11 +5150,11 @@ int RAWToJ400(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI; +#if defined(HAS_RAWTOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512VBMI; + RAWToARGBRow = RAWToARGBRow_AVX512BW; } } #endif diff --git a/source/convert_argb.cc b/source/convert_argb.cc index af26a6e9c..7672a6692 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3638,6 +3638,22 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24ToARGBRow = RGB24ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_RGB24TOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW; + } + } +#endif #if defined(HAS_RGB24TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; @@ -3672,8 +3688,7 @@ int RGB24ToARGB(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif - - for (y = 0; y < height; ++y) { +for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); src_rgb24 += src_stride_rgb24; dst_argb += dst_stride_argb; @@ -3723,11 +3738,11 @@ int RAWToARGB(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI; +#if defined(HAS_RAWTOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512VBMI; + RAWToARGBRow = RAWToARGBRow_AVX512BW; } } #endif diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 916016733..7e7a4f8cf 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -4184,11 +4184,11 @@ int RAWToNV21Matrix(const uint8_t* src_raw, } } #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) - if (TestCpuFlag(kCpuHasAVX512VBMI)) { - RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI; +#if defined(HAS_RAWTOARGBROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RAWToARGBRow = RAWToARGBRow_Any_AVX512BW; if (IS_ALIGNED(width, 64)) { - RAWToARGBRow = RAWToARGBRow_AVX512VBMI; + RAWToARGBRow = RAWToARGBRow_AVX512BW; } } #endif diff --git a/source/row_any.cc b/source/row_any.cc index 0ddf867b8..9fa7227b7 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1000,8 +1000,11 @@ ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) #if defined(HAS_RAWTOARGBROW_AVX2) ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31) #endif -#if defined(HAS_RAWTOARGBROW_AVX512VBMI) -ANY11(RAWToARGBRow_Any_AVX512VBMI, RAWToARGBRow_AVX512VBMI, 0, 3, 4, 63) +#if defined(HAS_RAWTOARGBROW_AVX512BW) +ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63) +#endif +#if defined(HAS_RGB24TOARGBROW_AVX512BW) +ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63) #endif #if defined(HAS_RAWTORGBAROW_SSSE3) ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 89127e4d5..228afe061 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -262,21 +262,18 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -#ifdef HAS_RAWTOARGBROW_AVX512VBMI -static const uint8_t kPermRAWToARGB_AVX512VBMI[64] = { - 2, 1, 0, 48, 5, 4, 3, 48, 8, 7, 6, 48, 11, 10, 9, 48, - 14, 13, 12, 48, 17, 16, 15, 48, 20, 19, 18, 48, 23, 22, 21, 48, - 26, 25, 24, 48, 29, 28, 27, 48, 32, 31, 30, 48, 35, 34, 33, 48, - 38, 37, 36, 48, 41, 40, 39, 48, 44, 43, 42, 48, 47, 46, 45, 48}; +#ifdef HAS_RAWTOARGBROW_AVX512BW +static const uint32_t kPermdRAWToARGB_AVX512BW[16] = { + 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; -// TODO(fbarchard): optimize this with a mask or vpermb -void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { +void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) { asm volatile( "vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff "vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000 "movabs $0xffffffffffff,%%rax \n" // 48 bytes mask "kmovq %%rax,%%k1 \n" - "vmovdqu8 %3,%%zmm5 \n" + "vmovdqu32 %3,%%zmm5 \n" + "vbroadcasti32x4 %4,%%zmm4 \n" LABELALIGN // "1: \n" @@ -285,10 +282,14 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt "vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n" "vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n" "lea 192(%0),%0 \n" - "vpermb %%zmm0,%%zmm5,%%zmm0 \n" - "vpermb %%zmm1,%%zmm5,%%zmm1 \n" - "vpermb %%zmm2,%%zmm5,%%zmm2 \n" - "vpermb %%zmm3,%%zmm5,%%zmm3 \n" + "vpermd %%zmm0,%%zmm5,%%zmm0 \n" + "vpermd %%zmm1,%%zmm5,%%zmm1 \n" + "vpermd %%zmm2,%%zmm5,%%zmm2 \n" + "vpermd %%zmm3,%%zmm5,%%zmm3 \n" + "vpshufb %%zmm4,%%zmm0,%%zmm0 \n" + "vpshufb %%zmm4,%%zmm1,%%zmm1 \n" + "vpshufb %%zmm4,%%zmm2,%%zmm2 \n" + "vpshufb %%zmm4,%%zmm3,%%zmm3 \n" "vpord %%zmm6,%%zmm0,%%zmm0 \n" "vpord %%zmm6,%%zmm1,%%zmm1 \n" "vpord %%zmm6,%%zmm2,%%zmm2 \n" @@ -301,11 +302,20 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt "sub $0x40,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kPermRAWToARGB_AVX512VBMI) // %3 - : "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm5", "zmm6"); + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kPermdRAWToARGB_AVX512BW), // %3 + "m"(*shuffler) // %4 + : "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6"); +} + +void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width); +} + +void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { + RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width); } #endif diff --git a/source/row_win.cc b/source/row_win.cc index e9080d19d..25f3ac9fe 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -261,15 +261,13 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } #endif -#ifdef HAS_RAWTOARGBROW_AVX512VBMI +#ifdef HAS_RAWTOARGBROW_AVX512BW LIBYUV_TARGET_AVX512BW -void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { +void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) { __m512i zmm_alpha = _mm512_set1_epi32(0xff000000); - __m512i zmm_shuf = _mm512_set_epi8( - 48, 45, 46, 47, 48, 42, 43, 44, 48, 39, 40, 41, 48, 36, 37, 38, - 48, 33, 34, 35, 48, 30, 31, 32, 48, 27, 28, 29, 48, 24, 25, 26, - 48, 21, 22, 23, 48, 18, 19, 20, 48, 15, 16, 17, 48, 12, 13, 14, - 48, 9, 10, 11, 48, 6, 7, 8, 48, 3, 4, 5, 48, 0, 1, 2); + __m512i zmm_perm = _mm512_set_epi32( + 12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0); + __m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler)); while (width > 0) { __m512i zmm0 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw); @@ -277,10 +275,15 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt __m512i zmm2 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 96); __m512i zmm3 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 144); - zmm0 = _mm512_permutexvar_epi8(zmm_shuf, zmm0); - zmm1 = _mm512_permutexvar_epi8(zmm_shuf, zmm1); - zmm2 = _mm512_permutexvar_epi8(zmm_shuf, zmm2); - zmm3 = _mm512_permutexvar_epi8(zmm_shuf, zmm3); + zmm0 = _mm512_permutexvar_epi32(zmm_perm, zmm0); + zmm1 = _mm512_permutexvar_epi32(zmm_perm, zmm1); + zmm2 = _mm512_permutexvar_epi32(zmm_perm, zmm2); + zmm3 = _mm512_permutexvar_epi32(zmm_perm, zmm3); + + zmm0 = _mm512_shuffle_epi8(zmm0, zmm_shuf); + zmm1 = _mm512_shuffle_epi8(zmm1, zmm_shuf); + zmm2 = _mm512_shuffle_epi8(zmm2, zmm_shuf); + zmm3 = _mm512_shuffle_epi8(zmm3, zmm_shuf); zmm0 = _mm512_or_si512(zmm0, zmm_alpha); zmm1 = _mm512_or_si512(zmm1, zmm_alpha); @@ -297,6 +300,18 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt width -= 64; } } + +LIBYUV_TARGET_AVX512BW +void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + __m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); + RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width); +} + +LIBYUV_TARGET_AVX512BW +void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { + __m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); + RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width); +} #endif #endif