RAWToARGB use AVX512BW

Bug: libyuv:42280902
Change-Id: I7a80fd64d97b6d411316819df0fd917d609a173b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7787163
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
Frank Barchard 2026-04-22 15:20:00 -07:00 committed by libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com
parent bd2c4c76ec
commit 4afb965416
7 changed files with 227 additions and 69 deletions

View File

@ -352,7 +352,8 @@ extern "C" {
defined(LIBYUV_ENABLE_ROWWIN))
#define HAS_RAWTOARGBROW_AVX2
#if defined(__x86_64__) || defined(_M_X64)
#define HAS_RAWTOARGBROW_AVX512VBMI
#define HAS_RAWTOARGBROW_AVX512BW
#define HAS_RGB24TOARGBROW_AVX512BW
#endif
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOYMATRIXROW_AVX2
@ -372,7 +373,8 @@ extern "C" {
!defined(LIBYUV_ENABLE_ROWWIN)
#define HAS_COPYROW_AVX512BW
#if defined(__x86_64__) || defined(_M_X64)
#define HAS_RAWTOARGBROW_AVX512VBMI
#define HAS_RAWTOARGBROW_AVX512BW
#define HAS_RGB24TOARGBROW_AVX512BW
#endif
#define HAS_ARGBTORGB24ROW_AVX512VBMI
#define HAS_CONVERT16TO8ROW_AVX512BW
@ -3980,7 +3982,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@ -4072,7 +4074,9 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_AVX512VBMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);

View File

@ -3077,6 +3077,56 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
RGB24ToARGBRow = RGB24ToARGBRow_SVE2;
}
#endif
#if defined(HAS_RGB24TOARGBROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_LSX;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_LASX;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
RGB24ToARGBRow = RGB24ToARGBRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@ -3220,6 +3270,56 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
RGB24ToARGBRow = RGB24ToARGBRow_SVE2;
}
#endif
#if defined(HAS_RGB24TOARGBROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_LSX;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_LASX;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
RGB24ToARGBRow = RGB24ToARGBRow_RVV;
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@ -3421,11 +3521,11 @@ int RAWToI420(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
@ -3618,11 +3718,11 @@ int RAWToJ420(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
@ -3902,11 +4002,11 @@ int RAWToI444(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
@ -4121,11 +4221,11 @@ int RAWToJ444(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
@ -4878,11 +4978,19 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
#if defined(HAS_RGB24TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
@ -4891,6 +4999,11 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
RGB24ToARGBRow = RGB24ToARGBRow_SVE2;
}
#endif
#if defined(HAS_RGB24TOARGBROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_LSX;
@ -4912,9 +5025,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
RGB24ToARGBRow = RGB24ToARGBRow_RVV;
}
#endif
{
{
// Allocate 1 row of ARGB.
const int row_size = (width * 4 + 31) & ~31;
align_buffer_64(row, row_size);
@ -5039,11 +5150,11 @@ int RAWToJ400(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif

View File

@ -3638,6 +3638,22 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
@ -3672,8 +3688,7 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
RGB24ToARGBRow = RGB24ToARGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) {
for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
src_rgb24 += src_stride_rgb24;
dst_argb += dst_stride_argb;
@ -3723,11 +3738,11 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif

View File

@ -4184,11 +4184,11 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
if (TestCpuFlag(kCpuHasAVX512VBMI)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512VBMI;
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512VBMI;
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif

View File

@ -1000,8 +1000,11 @@ ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#if defined(HAS_RAWTOARGBROW_AVX2)
ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31)
#endif
#if defined(HAS_RAWTOARGBROW_AVX512VBMI)
ANY11(RAWToARGBRow_Any_AVX512VBMI, RAWToARGBRow_AVX512VBMI, 0, 3, 4, 63)
#if defined(HAS_RAWTOARGBROW_AVX512BW)
ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63)
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63)
#endif
#if defined(HAS_RAWTORGBAROW_SSSE3)
ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)

View File

@ -262,21 +262,18 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#ifdef HAS_RAWTOARGBROW_AVX512VBMI
static const uint8_t kPermRAWToARGB_AVX512VBMI[64] = {
2, 1, 0, 48, 5, 4, 3, 48, 8, 7, 6, 48, 11, 10, 9, 48,
14, 13, 12, 48, 17, 16, 15, 48, 20, 19, 18, 48, 23, 22, 21, 48,
26, 25, 24, 48, 29, 28, 27, 48, 32, 31, 30, 48, 35, 34, 33, 48,
38, 37, 36, 48, 41, 40, 39, 48, 44, 43, 42, 48, 47, 46, 45, 48};
#ifdef HAS_RAWTOARGBROW_AVX512BW
static const uint32_t kPermdRAWToARGB_AVX512BW[16] = {
0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
// TODO(fbarchard): optimize this with a mask or vpermb
void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) {
asm volatile(
"vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff
"vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000
"movabs $0xffffffffffff,%%rax \n" // 48 bytes mask
"kmovq %%rax,%%k1 \n"
"vmovdqu8 %3,%%zmm5 \n"
"vmovdqu32 %3,%%zmm5 \n"
"vbroadcasti32x4 %4,%%zmm4 \n"
LABELALIGN //
"1: \n"
@ -285,10 +282,14 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt
"vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n"
"vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n"
"lea 192(%0),%0 \n"
"vpermb %%zmm0,%%zmm5,%%zmm0 \n"
"vpermb %%zmm1,%%zmm5,%%zmm1 \n"
"vpermb %%zmm2,%%zmm5,%%zmm2 \n"
"vpermb %%zmm3,%%zmm5,%%zmm3 \n"
"vpermd %%zmm0,%%zmm5,%%zmm0 \n"
"vpermd %%zmm1,%%zmm5,%%zmm1 \n"
"vpermd %%zmm2,%%zmm5,%%zmm2 \n"
"vpermd %%zmm3,%%zmm5,%%zmm3 \n"
"vpshufb %%zmm4,%%zmm0,%%zmm0 \n"
"vpshufb %%zmm4,%%zmm1,%%zmm1 \n"
"vpshufb %%zmm4,%%zmm2,%%zmm2 \n"
"vpshufb %%zmm4,%%zmm3,%%zmm3 \n"
"vpord %%zmm6,%%zmm0,%%zmm0 \n"
"vpord %%zmm6,%%zmm1,%%zmm1 \n"
"vpord %%zmm6,%%zmm2,%%zmm2 \n"
@ -301,11 +302,20 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt
"sub $0x40,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kPermRAWToARGB_AVX512VBMI) // %3
: "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm5", "zmm6");
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kPermdRAWToARGB_AVX512BW), // %3
"m"(*shuffler) // %4
: "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6");
}
// Convert a row of RAW (3 bytes/pixel) to ARGB (4 bytes/pixel) using the
// shared AVX512BW helper, selecting the RAW channel-order shuffle mask.
// NOTE(review): assumes kShuffleMaskRAWToARGB is a 16-byte vpshufb pattern
// broadcast across each 128-bit lane by the helper — confirm its declaration.
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width);
}
// Convert a row of RGB24 (3 bytes/pixel) to ARGB (4 bytes/pixel) using the
// shared AVX512BW helper, selecting the RGB24 channel-order shuffle mask.
// NOTE(review): assumes kShuffleMaskRGB24ToARGB is a 16-byte vpshufb pattern
// broadcast across each 128-bit lane by the helper — confirm its declaration.
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width);
}
#endif

View File

@ -261,15 +261,13 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
#endif
#ifdef HAS_RAWTOARGBROW_AVX512VBMI
#ifdef HAS_RAWTOARGBROW_AVX512BW
LIBYUV_TARGET_AVX512BW
void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) {
__m512i zmm_alpha = _mm512_set1_epi32(0xff000000);
__m512i zmm_shuf = _mm512_set_epi8(
48, 45, 46, 47, 48, 42, 43, 44, 48, 39, 40, 41, 48, 36, 37, 38,
48, 33, 34, 35, 48, 30, 31, 32, 48, 27, 28, 29, 48, 24, 25, 26,
48, 21, 22, 23, 48, 18, 19, 20, 48, 15, 16, 17, 48, 12, 13, 14,
48, 9, 10, 11, 48, 6, 7, 8, 48, 3, 4, 5, 48, 0, 1, 2);
__m512i zmm_perm = _mm512_set_epi32(
12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
__m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler));
while (width > 0) {
__m512i zmm0 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw);
@ -277,10 +275,15 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt
__m512i zmm2 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 96);
__m512i zmm3 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 144);
zmm0 = _mm512_permutexvar_epi8(zmm_shuf, zmm0);
zmm1 = _mm512_permutexvar_epi8(zmm_shuf, zmm1);
zmm2 = _mm512_permutexvar_epi8(zmm_shuf, zmm2);
zmm3 = _mm512_permutexvar_epi8(zmm_shuf, zmm3);
zmm0 = _mm512_permutexvar_epi32(zmm_perm, zmm0);
zmm1 = _mm512_permutexvar_epi32(zmm_perm, zmm1);
zmm2 = _mm512_permutexvar_epi32(zmm_perm, zmm2);
zmm3 = _mm512_permutexvar_epi32(zmm_perm, zmm3);
zmm0 = _mm512_shuffle_epi8(zmm0, zmm_shuf);
zmm1 = _mm512_shuffle_epi8(zmm1, zmm_shuf);
zmm2 = _mm512_shuffle_epi8(zmm2, zmm_shuf);
zmm3 = _mm512_shuffle_epi8(zmm3, zmm_shuf);
zmm0 = _mm512_or_si512(zmm0, zmm_alpha);
zmm1 = _mm512_or_si512(zmm1, zmm_alpha);
@ -297,6 +300,18 @@ void RAWToARGBRow_AVX512VBMI(const uint8_t* src_raw, uint8_t* dst_argb, int widt
width -= 64;
}
}
LIBYUV_TARGET_AVX512BW
// Convert a row of RAW (3 bytes/pixel) to ARGB via the shared AVX512BW
// helper. The per-lane shuffle reverses each pixel's 3 channel bytes and
// leaves a -1 slot where the helper inserts the opaque alpha byte.
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  // Low-to-high byte order: pixel 0 -> {2,1,0,alpha}, pixel 1 -> {5,4,3,alpha}, ...
  const __m128i kRawToArgbShuffle =
      _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
  RGBToARGBRow_AVX512BW(src_raw, dst_argb, &kRawToArgbShuffle, width);
}
LIBYUV_TARGET_AVX512BW
// Convert a row of RGB24 (3 bytes/pixel) to ARGB via the shared AVX512BW
// helper. The per-lane shuffle keeps each pixel's channel order and leaves
// a -1 slot where the helper inserts the opaque alpha byte.
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
  // Low-to-high byte order: pixel 0 -> {0,1,2,alpha}, pixel 1 -> {3,4,5,alpha}, ...
  const __m128i kRgb24ToArgbShuffle =
      _mm_setr_epi8(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1);
  RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &kRgb24ToArgbShuffle, width);
}
#endif
#endif