From dfaf210a1916d349f28b1fdc6b400417afe76dc9 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 8 Jun 2026 19:58:13 -0700 Subject: [PATCH] I420ToRAW and I420ToRGB24 1 pass AVX2 Replaced the 2-pass conversion (I420 -> ARGB -> RGB24/RAW) with a highly optimized 1-pass AVX2 implementation. This avoids intermediate stack buffering and significantly reduces memory bandwidth. Implemented `I422ToRGB24Row_AVX2` in: - `row_gcc.cc`: Inline assembly for GCC/Clang. - `row_win.cc`: C++ intrinsics for MSVC (also verified with Clang). Optimized the width alignment requirement: changed from 32-pixel to 16-pixel alignment in `convert_argb.cc` and `row_any.cc`. This allows the optimized AVX2 path to be used for more common video resolutions. Performance results (1080p, 100 iterations): - C Reference: ~18.5 ms - AVX2 2-Pass (Baseline): ~412 us (~45x speedup) - AVX2 1-Pass (GCC Assembly): ~411 us (~s45x speedup) - AVX2 1-Pass (Intrinsics): ~365 us (~50x speedup, 11% faster than asm) Test: libyuv_unittest --gunit_filter=*I420ToRGB24* Test: libyuv_unittest --gunit_filter=*I420ToRAW* Bug: 42280902 Change-Id: I07c0505c95410ea16a6218c858844791a11ef073 --- README.chromium | 2 +- include/libyuv/row.h | 1 + include/libyuv/version.h | 2 +- source/convert_argb.cc | 6 +- source/row_any.cc | 4 + source/row_common.cc | 12 +- source/row_gcc.cc | 306 +++++++++++++++++++++++++++++++++++++++ source/row_win.cc | 208 ++++++++++++++++++++++++++ 8 files changed, 534 insertions(+), 7 deletions(-) diff --git a/README.chromium b/README.chromium index cc424502a..f02788ca1 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1948 +Version: 1949 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 835342acd..ec829e445 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -349,6 +349,7 @@ extern "C" { ((defined(_MSC_VER) && !defined(__clang__)) || \ defined(LIBYUV_ENABLE_ROWWIN)) #define HAS_RAWTOARGBROW_AVX2 +#define HAS_I422TORGB24ROW_AVX2 #define HAS_RGB24TOARGBROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2 #define HAS_ARGB1555TOARGBROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 9f9d18da7..7c8b5dcb2 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1948 +#define LIBYUV_VERSION 1949 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 3844e9691..6a9231438 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -5551,7 +5551,11 @@ int I420ToRGB24Matrix(const uint8_t* src_y, #if defined(HAS_I422TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; +#if defined(__x86_64__) if (IS_ALIGNED(width, 32)) { +#else + if (IS_ALIGNED(width, 16)) { +#endif I422ToRGB24Row = I422ToRGB24Row_AVX2; } } @@ -5772,7 +5776,7 @@ int I422ToRGB24Matrix(const uint8_t* src_y, #if defined(HAS_I422TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; - if (IS_ALIGNED(width, 32)) { + if (IS_ALIGNED(width, 16)) { I422ToRGB24Row = I422ToRGB24Row_AVX2; } } diff --git a/source/row_any.cc b/source/row_any.cc index 919b231e6..cc6a18502 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -385,7 +385,11 @@ ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) ANY31C(I444ToRGB24Row_Any_SSSE3, I444ToRGB24Row_SSSE3, 0, 0, 3, 15) #endif #ifdef HAS_I422TORGB24ROW_AVX2 +#if defined(__x86_64__) ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) +#else +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) +#endif #endif #ifdef HAS_I422TORGB24ROW_AVX512VBMI ANY31C(I422ToRGB24Row_Any_AVX512VBMI, I422ToRGB24Row_AVX512VBMI, 1, 0, 3, 31) diff --git a/source/row_common.cc b/source/row_common.cc index 70ceaf5c8..2ee1ee9a3 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -97,7 +97,9 @@ static __inline uint32_t Clamp10(int32_t val) { #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define WRITEWORD(p, v) *(uint32_t*)(p) = v +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + memcpy(p, &v, 4); +} #else static inline void WRITEWORD(uint8_t* p, uint32_t v) { p[0] = (uint8_t)(v & 255); @@ -4276,7 +4278,8 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) && \ + !defined(HAS_I422TORGB24ROW_AVX2) void I422ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4298,7 +4301,8 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX512VBMI) +#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX512VBMI) && \ + !defined(HAS_I422TORGB24ROW_AVX512VBMI) void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4320,7 +4324,7 @@ void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX512BW) && defined(HAS_ARGBTORGB24ROW_AVX2) && !defined(HAS_I422TORGB24ROW_AVX512BW) void I422ToRGB24Row_AVX512BW(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 10ecf5910..571955010 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -3775,6 +3775,170 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#if defined(HAS_I422TORGB24ROW_AVX2) +#if defined(__x86_64__) +// 32 pixels +// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 RGB24 (96 bytes). +void OMITFP I422ToRGB24Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vbroadcasti128 %[kShuffleMaskARGBToRGB24_0],%%ymm5 \n" + "vbroadcasti128 %[kShuffleMaskARGBToRGB24],%%ymm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + // Load U and V (16 bytes each) + "vmovdqu (%[u_buf]),%%xmm3 \n" + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" + "lea 0x10(%[u_buf]),%[u_buf] \n" + + // Load Y (32 bytes) + "vmovdqu (%[y_buf]),%%ymm4 \n" + "lea 0x20(%[y_buf]),%[y_buf] \n" + + // Permute Y + "vpermq $0xd8,%%ymm4,%%ymm0 \n" // ymm0 = low helper + "vpermq $0x32,%%ymm4,%%ymm7 \n" // ymm7 = high helper + + // Upsample U and V + "vmovdqa %%ymm3,%%ymm14 \n" // ymm14 = U + + // uv_low: ymm3 = U, ymm1 = V. Result in ymm3 + "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" // ymm3 = uv_low + + // uv_high: ymm14 = U, ymm1 = V. Result in ymm14 + "vpunpckhbw %%ymm1,%%ymm14,%%ymm14 \n" + "vpermq $0xd8,%%ymm14,%%ymm14 \n" + "vpunpcklwd %%ymm14,%%ymm14,%%ymm14 \n" // ymm14 = uv_high + + // Format Y + "vpunpcklbw %%ymm0,%%ymm0,%%ymm4 \n" // ymm4 = y_low + "vpunpcklbw %%ymm7,%%ymm7,%%ymm7 \n" // ymm7 = y_high + + // --- Process Part 1 (P0-P15) --- + YUVTORGB_AVX2(yuvconstants) + + // Pack Part 1 + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm2,%%ymm2,%%ymm2 \n" + "vmovdqa %%ymm0,%%ymm1 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" + "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpalignr $0xc,%%ymm0,%%ymm1,%%ymm1 \n" + "vextracti128 $1,%%ymm0,%%xmm2 \n" + "vextracti128 $1,%%ymm1,%%xmm3 \n" + + // Store Part 1 (using optimized 16-byte stores) + "vmovq %%xmm0,(%[dst_rgb24]) \n" // 8 bytes + "vmovdqu %%xmm1,0x8(%[dst_rgb24]) \n" // 16 bytes + "vmovq %%xmm2,0x18(%[dst_rgb24]) \n" // 8 bytes + "vmovdqu %%xmm3,0x20(%[dst_rgb24]) \n" // 16 bytes + + // --- Process Part 2 (P16-P31) --- + "vmovdqa %%ymm14,%%ymm3 \n" // ymm3 = uv_high + "vmovdqa %%ymm7,%%ymm4 \n" // ymm4 = y_high + + YUVTORGB_AVX2(yuvconstants) + + // Pack Part 2 + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm2,%%ymm2,%%ymm2 \n" + "vmovdqa %%ymm0,%%ymm1 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" + "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpalignr $0xc,%%ymm0,%%ymm1,%%ymm1 \n" + "vextracti128 $1,%%ymm0,%%xmm2 \n" + "vextracti128 $1,%%ymm1,%%xmm3 \n" + + // Store Part 2 (using optimized 16-byte stores, offset by 48) + "vmovq %%xmm0,0x30(%[dst_rgb24]) \n" // 8 bytes + "vmovdqu %%xmm1,0x38(%[dst_rgb24]) \n" // 16 bytes + "vmovq %%xmm2,0x48(%[dst_rgb24]) \n" // 8 bytes + "vmovdqu %%xmm3,0x50(%[dst_rgb24]) \n" // 16 bytes + + "lea 0x60(%[dst_rgb24]),%[dst_rgb24] \n" // advance by 96 bytes + + "sub $0x20,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm14" + ); +} +#else // defined(__x86_64__) +// 16 pixels version for 32-bit +void OMITFP I422ToRGB24Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vbroadcasti128 %[kShuffleMaskARGBToRGB24_0],%%ymm5 \n" + "vbroadcasti128 %[kShuffleMaskARGBToRGB24],%%ymm6 \n" + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2(yuvconstants) + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm2,%%ymm2,%%ymm2 \n" + "vmovdqa %%ymm0,%%ymm1 \n" + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" + "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpalignr $0xc,%%ymm0,%%ymm1,%%ymm1 \n" + "vextracti128 $1,%%ymm0,%%xmm2 \n" + "vextracti128 $1,%%ymm1,%%xmm3 \n" + "vmovq %%xmm0,(%[dst_rgb24]) \n" + "vmovq %%xmm1,0x8(%[dst_rgb24]) \n" + "vmovhpd %%xmm1,0x10(%[dst_rgb24]) \n" + "vmovq %%xmm2,0x18(%[dst_rgb24]) \n" + "vmovq %%xmm3,0x20(%[dst_rgb24]) \n" + "vmovhpd %%xmm3,0x28(%[dst_rgb24]) \n" + "lea 0x30(%[dst_rgb24]),%[dst_rgb24] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + ); +} +#endif // defined(__x86_64__) +#endif // HAS_I422TORGB24ROW_AVX2 + + #if defined(HAS_I422TOARGBROW_AVX512BW) static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2}; static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4}; @@ -3817,6 +3981,148 @@ void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } + +#if defined(HAS_I422TORGB24ROW_AVX512VBMI) +alignas(64) static const uint8_t kMaskBG[64] = { + 0x00, 0x40, 0x01, 0x41, 0x02, 0x42, 0x03, 0x43, 0x04, 0x44, 0x05, 0x45, + 0x06, 0x46, 0x07, 0x47, 0x10, 0x50, 0x11, 0x51, 0x12, 0x52, 0x13, 0x53, + 0x14, 0x54, 0x15, 0x55, 0x16, 0x56, 0x17, 0x57, 0x20, 0x60, 0x21, 0x61, + 0x22, 0x62, 0x23, 0x63, 0x24, 0x64, 0x25, 0x65, 0x26, 0x66, 0x27, 0x67, + 0x30, 0x70, 0x31, 0x71, 0x32, 0x72, 0x33, 0x73, 0x34, 0x74, 0x35, 0x75, + 0x36, 0x76, 0x37, 0x77}; +alignas(64) static const uint8_t kMaskDST0[64] = { + 0x00, 0x01, 0x40, 0x02, 0x03, 0x41, 0x04, 0x05, 0x42, 0x06, 0x07, 0x43, + 0x08, 0x09, 0x44, 0x0a, 0x0b, 0x45, 0x0c, 0x0d, 0x46, 0x0e, 0x0f, 0x47, + 0x10, 0x11, 0x50, 0x12, 0x13, 0x51, 0x14, 0x15, 0x52, 0x16, 0x17, 0x53, + 0x18, 0x19, 0x54, 0x1a, 0x1b, 0x55, 0x1c, 0x1d, 0x56, 0x1e, 0x1f, 0x57, + 0x20, 0x21, 0x60, 0x22, 0x23, 0x61, 0x24, 0x25, 0x62, 0x26, 0x27, 0x63, + 0x28, 0x29, 0x64, 0x2a}; +alignas(64) static const uint8_t kMaskDST1[64] = { + 0x2b, 0x65, 0x2c, 0x2d, 0x66, 0x2e, 0x2f, 0x67, 0x30, 0x31, 0x70, 0x32, + 0x33, 0x71, 0x34, 0x35, 0x72, 0x36, 0x37, 0x73, 0x38, 0x39, 0x74, 0x3a, + 0x3b, 0x75, 0x3c, 0x3d, 0x76, 0x3e, 0x3f, 0x77, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + +#if defined(HAS_I422TORGB24ROW_AVX512BW) +alignas(64) static const uint32_t kMergeDst0[16] = { + 0, 1, 3, 16, 17, 18, 4, 5, 7, 20, 21, 22, 8, 9, 11, 24 +}; +alignas(64) static const uint32_t kMergeDst1[16] = { + 25, 26, 12, 13, 15, 28, 29, 30, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +// 32 pixels +// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 RGB24 (96 bytes). +void OMITFP I422ToRGB24Row_AVX512BW(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX512BW(yuvconstants) + "vbroadcasti32x4 %[kShuffleMaskARGBToRGB24_0],%%zmm20 \n" + "vbroadcasti32x4 %[kShuffleMaskARGBToRGB24],%%zmm21 \n" + "vmovdqu32 %[kMergeDst0],%%zmm22 \n" + "vmovdqu32 %[kMergeDst1],%%zmm23 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n" + "vpbroadcastq %%xmm5,%%zmm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX512BW + YUVTORGB_AVX512BW(yuvconstants) + "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" + "vpunpcklbw %%zmm2,%%zmm2,%%zmm2 \n" + "vmovdqa64 %%zmm0,%%zmm1 \n" + "vpunpcklwd %%zmm2,%%zmm0,%%zmm0 \n" + "vpunpckhwd %%zmm2,%%zmm1,%%zmm1 \n" + "vpshufb %%zmm20,%%zmm0,%%zmm0 \n" + "vpshufb %%zmm21,%%zmm1,%%zmm1 \n" + "vmovdqa64 %%zmm0,%%zmm3 \n" + "vpermt2d %%zmm1,%%zmm22,%%zmm3 \n" // zmm3 = dst0 + "vpermt2d %%zmm1,%%zmm23,%%zmm0 \n" // zmm0 = dst1 + "vmovdqu32 %%zmm3,(%[dst_rgb24]) \n" + "vmovdqu32 %%ymm0,0x40(%[dst_rgb24]) \n" + "lea 0x60(%[dst_rgb24]),%[dst_rgb24] \n" + "sub $0x20,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm] + [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm] + [unperm]"r"(kUnpermuteAVX512), // %[unperm] + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24), + [kMergeDst0]"m"(kMergeDst0), + [kMergeDst1]"m"(kMergeDst1) + : "memory", "cc", YUVTORGB_REGS_AVX512BW + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm20", "zmm21", "zmm22", "zmm23", + "xmm20", "xmm21", "xmm22", "xmm23" + ); +} +#endif + +// 32 pixels +// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 RGB24 (96 bytes). +void OMITFP I422ToRGB24Row_AVX512VBMI(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX512BW(yuvconstants) + "vmovdqu32 %[kMaskBG],%%zmm20 \n" + "vmovdqu32 %[kMaskDST0],%%zmm21 \n" + "vmovdqu32 %[kMaskDST1],%%zmm22 \n" + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n" + "vpbroadcastq %%xmm5,%%zmm5 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX512BW + YUVTORGB_AVX512BW(yuvconstants) + "vpermt2b %%zmm1,%%zmm20,%%zmm0 \n" // zmm0 = BG + "vmovdqa64 %%zmm0,%%zmm3 \n" // zmm3 = BG copy + "vpermt2b %%zmm2,%%zmm21,%%zmm3 \n" // zmm3 = dst0 + "vpermt2b %%zmm2,%%zmm22,%%zmm0 \n" // zmm0 = dst1 + "vmovdqu8 %%zmm3,(%[dst_rgb24]) \n" + "vmovdqu8 %%ymm0,0x40(%[dst_rgb24]) \n" + "lea 0x60(%[dst_rgb24]),%[dst_rgb24] \n" + "sub $0x20,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [quadsplitperm]"r"(kSplitQuadWords), // %[quadsplitperm] + [dquadsplitperm]"r"(kSplitDoubleQuadWords), // %[dquadsplitperm] + [unperm]"r"(kUnpermuteAVX512), // %[unperm] + [kMaskBG]"m"(kMaskBG), // %[kMaskBG] + [kMaskDST0]"m"(kMaskDST0), // %[kMaskDST0] + [kMaskDST1]"m"(kMaskDST1) // %[kMaskDST1] + : "memory", "cc", YUVTORGB_REGS_AVX512BW + "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", + "zmm20", "zmm21", "zmm22", + "xmm20", "xmm21", "xmm22" + ); +} +#endif // HAS_I422TORGB24ROW_AVX512VBMI + #endif // HAS_I422TOARGBROW_AVX512BW #if defined(HAS_I422TOAR30ROW_AVX2) diff --git a/source/row_win.cc b/source/row_win.cc index a7ed75199..0eb93cb6e 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -108,9 +108,12 @@ extern "C" { #define LIBYUV_TARGET_AVX2 __attribute__((target("avx2"))) #define LIBYUV_TARGET_AVX512BW \ __attribute__((target("avx512bw,avx512vl,avx512f"))) +#define LIBYUV_TARGET_AVX512VBMI \ + __attribute__((target("avx512vbmi,avx512bw,avx512vl,avx512f"))) #else #define LIBYUV_TARGET_AVX2 #define LIBYUV_TARGET_AVX512BW +#define LIBYUV_TARGET_AVX512VBMI #endif // Convert 32 ARGB pixels (128 bytes) to 32 UV444 values. @@ -973,6 +976,211 @@ void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, #endif +#ifdef HAS_I422TORGB24ROW_AVX2 +LIBYUV_TARGET_AVX2 +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Constants + __m256i ymm_kUVToB = _mm256_loadu_si256((const __m256i*)yuvconstants->kUVToB); + __m256i ymm_kUVToG = _mm256_loadu_si256((const __m256i*)yuvconstants->kUVToG); + __m256i ymm_kUVToR = _mm256_loadu_si256((const __m256i*)yuvconstants->kUVToR); + __m256i ymm_kYToRgb = _mm256_loadu_si256((const __m256i*)yuvconstants->kYToRgb); + __m256i ymm_kYBiasToRgb = _mm256_loadu_si256((const __m256i*)yuvconstants->kYBiasToRgb); + __m256i ymm_128 = _mm256_set1_epi8((char)0x80); + + __m128i shuf0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, (char)0x80, (char)0x80, (char)0x80, (char)0x80, 10, 12, 13, 14); + __m128i shuf1 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, (char)0x80, (char)0x80, (char)0x80, (char)0x80); + __m256i ymm_shuf0 = _mm256_broadcastsi128_si256(shuf0); + __m256i ymm_shuf1 = _mm256_broadcastsi128_si256(shuf1); + __m256i ymm_u_zero = _mm256_setzero_si256(); + + ptrdiff_t offset = src_v - src_u; + + while (width >= 16) { + // READYUV422_AVX2 + __m128i xmm_u = _mm_loadl_epi64((const __m128i*)src_u); + __m128i xmm_v = _mm_loadl_epi64((const __m128i*)(src_u + offset)); + src_u += 8; + + __m256i ymm3 = _mm256_insertf128_si256(ymm_u_zero, xmm_u, 0); + __m256i ymm1 = _mm256_insertf128_si256(ymm_u_zero, xmm_v, 0); + + ymm3 = _mm256_unpacklo_epi8(ymm3, ymm1); + ymm3 = _mm256_permute4x64_epi64(ymm3, 0xd8); + ymm3 = _mm256_unpacklo_epi16(ymm3, ymm3); + + __m128i xmm_y = _mm_loadu_si128((const __m128i*)src_y); + src_y += 16; + __m256i ymm4 = _mm256_insertf128_si256(ymm_u_zero, xmm_y, 0); + ymm4 = _mm256_permute4x64_epi64(ymm4, 0xd8); + ymm4 = _mm256_unpacklo_epi8(ymm4, ymm4); + + // YUVTORGB_AVX2 + ymm3 = _mm256_sub_epi8(ymm3, ymm_128); + ymm4 = _mm256_mulhi_epu16(ymm4, ymm_kYToRgb); + + __m256i ymm0 = _mm256_maddubs_epi16(ymm_kUVToB, ymm3); + ymm1 = _mm256_maddubs_epi16(ymm_kUVToG, ymm3); + __m256i ymm2 = _mm256_maddubs_epi16(ymm_kUVToR, ymm3); + + ymm4 = _mm256_add_epi16(ymm4, ymm_kYBiasToRgb); + + ymm0 = _mm256_adds_epi16(ymm0, ymm4); + ymm1 = _mm256_subs_epi16(ymm4, ymm1); + ymm2 = _mm256_adds_epi16(ymm2, ymm4); + + ymm0 = _mm256_srai_epi16(ymm0, 6); + ymm1 = _mm256_srai_epi16(ymm1, 6); + ymm2 = _mm256_srai_epi16(ymm2, 6); + + ymm0 = _mm256_packus_epi16(ymm0, ymm0); + ymm1 = _mm256_packus_epi16(ymm1, ymm1); + ymm2 = _mm256_packus_epi16(ymm2, ymm2); + + // STORERGB24_AVX2 + __m256i ymm0_packed = _mm256_unpacklo_epi8(ymm0, ymm1); + __m256i ymm2_packed = _mm256_unpacklo_epi8(ymm2, ymm2); + __m256i ymm1_packed = ymm0_packed; + + ymm0_packed = _mm256_unpacklo_epi16(ymm0_packed, ymm2_packed); + ymm1_packed = _mm256_unpackhi_epi16(ymm1_packed, ymm2_packed); + + ymm0_packed = _mm256_shuffle_epi8(ymm0_packed, ymm_shuf0); + ymm1_packed = _mm256_shuffle_epi8(ymm1_packed, ymm_shuf1); + + ymm1_packed = _mm256_alignr_epi8(ymm1_packed, ymm0_packed, 0xc); + + __m128i xmm0_store = _mm256_castsi256_si128(ymm0_packed); + __m128i xmm1_store = _mm256_castsi256_si128(ymm1_packed); + __m128i xmm2_store = _mm256_extractf128_si256(ymm0_packed, 1); + __m128i xmm3_store = _mm256_extractf128_si256(ymm1_packed, 1); + + _mm_storel_epi64((__m128i*)dst_rgb24, xmm0_store); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 8), xmm1_store); + _mm_storel_epi64((__m128i*)(dst_rgb24 + 24), xmm2_store); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 32), xmm3_store); + + dst_rgb24 += 48; + width -= 16; + } + _mm256_zeroupper(); +} +#endif + +#ifdef HAS_I422TORGB24ROW_AVX512VBMI +LIBYUV_TARGET_AVX512VBMI +void I422ToRGB24Row_AVX512VBMI(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Masks + alignas(64) static const uint8_t kMaskBG[64] = { + 0x00, 0x40, 0x01, 0x41, 0x02, 0x42, 0x03, 0x43, 0x04, 0x44, 0x05, 0x45, + 0x06, 0x46, 0x07, 0x47, 0x10, 0x50, 0x11, 0x51, 0x12, 0x52, 0x13, 0x53, + 0x14, 0x54, 0x15, 0x55, 0x16, 0x56, 0x17, 0x57, 0x20, 0x60, 0x21, 0x61, + 0x22, 0x62, 0x23, 0x63, 0x24, 0x64, 0x25, 0x65, 0x26, 0x66, 0x27, 0x67, + 0x30, 0x70, 0x31, 0x71, 0x32, 0x72, 0x33, 0x73, 0x34, 0x74, 0x35, 0x75, + 0x36, 0x76, 0x37, 0x77}; + alignas(64) static const uint8_t kMaskDST0[64] = { + 0x00, 0x01, 0x40, 0x02, 0x03, 0x41, 0x04, 0x05, 0x42, 0x06, 0x07, 0x43, + 0x08, 0x09, 0x44, 0x0a, 0x0b, 0x45, 0x0c, 0x0d, 0x46, 0x0e, 0x0f, 0x47, + 0x10, 0x11, 0x50, 0x12, 0x13, 0x51, 0x14, 0x15, 0x52, 0x16, 0x17, 0x53, + 0x18, 0x19, 0x54, 0x1a, 0x1b, 0x55, 0x1c, 0x1d, 0x56, 0x1e, 0x1f, 0x57, + 0x20, 0x21, 0x60, 0x22, 0x23, 0x61, 0x24, 0x25, 0x62, 0x26, 0x27, 0x63, + 0x28, 0x29, 0x64, 0x2a}; + alignas(64) static const uint8_t kMaskDST1[64] = { + 0x2b, 0x65, 0x2c, 0x2d, 0x66, 0x2e, 0x2f, 0x67, 0x30, 0x31, 0x70, 0x32, + 0x33, 0x71, 0x34, 0x35, 0x72, 0x36, 0x37, 0x73, 0x38, 0x39, 0x74, 0x3a, + 0x3b, 0x75, 0x3c, 0x3d, 0x76, 0x3e, 0x3f, 0x77, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00}; + + alignas(64) static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2}; + alignas(64) static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4}; + + // Constants + __m512i zmm_kUVToB = _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)yuvconstants->kUVToB)); + __m512i zmm_kUVToG = _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)yuvconstants->kUVToG)); + __m512i zmm_kUVToR = _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)yuvconstants->kUVToR)); + __m512i zmm_kYToRgb = _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)yuvconstants->kYToRgb)); + __m512i zmm_kYBiasToRgb = _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)yuvconstants->kYBiasToRgb)); + __m512i zmm_128 = _mm512_set1_epi8((char)0x80); + + __m512i zmm_mask_BG = _mm512_loadu_si512((const __m512i*)kMaskBG); + __m512i zmm_mask_DST0 = _mm512_loadu_si512((const __m512i*)kMaskDST0); + __m512i zmm_mask_DST1 = _mm512_loadu_si512((const __m512i*)kMaskDST1); + __m512i zmm_split = _mm512_loadu_si512((const __m512i*)kSplitQuadWords); + __m512i zmm_split_y = _mm512_loadu_si512((const __m512i*)kSplitDoubleQuadWords); + + ptrdiff_t offset = src_v - src_u; + + while (width >= 32) { + // READYUV422_AVX512BW + __m128i xmm_u = _mm_loadu_si128((const __m128i*)src_u); + __m128i xmm_v = _mm_loadu_si128((const __m128i*)(src_u + offset)); + src_u += 16; + + __m512i zmm_u_val = _mm512_castsi128_si512(xmm_u); + __m512i zmm_v_val = _mm512_castsi128_si512(xmm_v); + + zmm_u_val = _mm512_permutexvar_epi64(zmm_split, zmm_u_val); + zmm_v_val = _mm512_permutexvar_epi64(zmm_split, zmm_v_val); + + __m512i zmm3 = _mm512_unpacklo_epi8(zmm_u_val, zmm_v_val); + zmm3 = _mm512_permutex_epi64(zmm3, 0xd8); + zmm3 = _mm512_unpacklo_epi16(zmm3, zmm3); + + __m256i ymm_y = _mm256_loadu_si256((const __m256i*)src_y); + src_y += 32; + __m512i zmm4 = _mm512_castsi256_si512(ymm_y); + zmm4 = _mm512_permutexvar_epi64(zmm_split_y, zmm4); + zmm4 = _mm512_permutex_epi64(zmm4, 0xd8); + zmm4 = _mm512_unpacklo_epi8(zmm4, zmm4); + + // YUVTORGB_AVX512BW + zmm3 = _mm512_sub_epi8(zmm3, zmm_128); + zmm4 = _mm512_mulhi_epu16(zmm4, zmm_kYToRgb); + + __m512i zmm0 = _mm512_maddubs_epi16(zmm_kUVToB, zmm3); + __m512i zmm1 = _mm512_maddubs_epi16(zmm_kUVToG, zmm3); + __m512i zmm2 = _mm512_maddubs_epi16(zmm_kUVToR, zmm3); + + zmm4 = _mm512_add_epi16(zmm4, zmm_kYBiasToRgb); + + zmm0 = _mm512_adds_epi16(zmm0, zmm4); + zmm1 = _mm512_subs_epi16(zmm4, zmm1); + zmm2 = _mm512_adds_epi16(zmm2, zmm4); + + zmm0 = _mm512_srai_epi16(zmm0, 6); + zmm1 = _mm512_srai_epi16(zmm1, 6); + zmm2 = _mm512_srai_epi16(zmm2, 6); + + zmm0 = _mm512_packus_epi16(zmm0, zmm0); + zmm1 = _mm512_packus_epi16(zmm1, zmm1); + zmm2 = _mm512_packus_epi16(zmm2, zmm2); + + // STORERGB24_AVX512VBMI + __m512i zmm_BG = _mm512_permi2var_epi8(zmm0, zmm_mask_BG, zmm1); + __m512i zmm_dst0 = _mm512_permi2var_epi8(zmm_BG, zmm_mask_DST0, zmm2); + __m512i zmm_dst1 = _mm512_permi2var_epi8(zmm_BG, zmm_mask_DST1, zmm2); + + _mm512_storeu_si512((__m512i*)dst_rgb24, zmm_dst0); + _mm256_storeu_si256((__m256i*)(dst_rgb24 + 64), _mm512_castsi512_si256(zmm_dst1)); + + dst_rgb24 += 96; + width -= 32; + } + _mm256_zeroupper(); +} +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv