diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 3a22292b2..b8a778d79 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -398,6 +398,13 @@ extern "C" {
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #endif
 
+// The following are available for AVX512 clang x64 platforms:
+// TODO(fbarchard): Port to x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+    defined(__x86_64__) && (defined(CLANG_HAS_AVX512))
+#define HAS_I422TOARGBROW_AVX512BW
+#endif
+
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -3027,6 +3034,12 @@ void I422ToARGBRow_AVX2(const uint8_t* y_buf,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void I422ToRGBARow_AVX2(const uint8_t* y_buf,
                         const uint8_t* u_buf,
                         const uint8_t* v_buf,
@@ -3368,6 +3381,12 @@ void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
+void I422ToARGBRow_Any_AVX512BW(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
 void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
                             const uint8_t* u_buf,
                             const uint8_t* v_buf,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 77d4f3bec..b29dfc681 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -7,7 +7,6 @@
  * in the file PATENTS. All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
-
 #include "libyuv/convert_argb.h"
 
 #include "libyuv/cpu_id.h"
@@ -90,6 +89,14 @@ int I420ToARGBMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
@@ -321,6 +328,14 @@ int I422ToARGBMatrix(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
@@ -5142,6 +5157,14 @@ int I420ToRGB565Dither(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
diff --git a/source/row_any.cc b/source/row_any.cc
index 05b816135..c6f9d2250 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -374,6 +374,9 @@ ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
 #ifdef HAS_I422TOARGBROW_AVX2
 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
 #endif
+#ifdef HAS_I422TOARGBROW_AVX512BW
+ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31)
+#endif
 #ifdef HAS_I422TORGBAROW_AVX2
 ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
 #endif
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 1c7f7eaa0..5f7ab0c74 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -3181,6 +3181,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n"
 
+#define READYUV422_AVX512BW \
+  "vmovdqu (%[u_buf]),%%xmm3 \n" \
+  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
+  "vpermq %%zmm3,%%zmm16,%%zmm3 \n" \
+  "vpermq %%zmm1,%%zmm16,%%zmm1 \n" \
+  "lea 0x10(%[u_buf]),%[u_buf] \n" \
+  "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
+  "vpermq $0xd8,%%zmm3,%%zmm3 \n" \
+  "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
+  "vmovdqu8 (%[y_buf]),%%ymm4 \n" \
+  "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
+  "vpermq $0xd8,%%zmm4,%%zmm4 \n" \
+  "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
+  "lea 0x20(%[y_buf]),%[y_buf] \n"
+
 // Read 8 UV from 210, upsample to 16 UV
 // TODO(fbarchard): Consider vshufb to replace pack/unpack
 // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
@@ -3356,6 +3371,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
   "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
 
+// TODO(fbarchard): Remove broadcastb
 #if defined(__x86_64__)
 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
   "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
@@ -3367,6 +3383,24 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
   "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
 
+#define YUVTORGB_SETUP_AVX512BW(yuvconstants) \
+  "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
+  "movdqa (%[yuvconstants]),%%xmm8 \n" \
+  "vpbroadcastq %%xmm8, %%zmm8 \n" \
+  "vpsllw $7,%%xmm13,%%xmm13 \n" \
+  "vpbroadcastb %%xmm13,%%zmm13 \n" \
+  "movq 32(%[yuvconstants]),%%xmm9 \n" \
+  "vpbroadcastq %%xmm9,%%zmm9 \n" \
+  "movq 64(%[yuvconstants]),%%xmm10 \n" \
+  "vpbroadcastq %%xmm10,%%zmm10 \n" \
+  "movq 96(%[yuvconstants]),%%xmm11 \n" \
+  "vpbroadcastq %%xmm11,%%zmm11 \n" \
+  "movq 128(%[yuvconstants]),%%xmm12 \n" \
+  "vpbroadcastq %%xmm12,%%zmm12 \n" \
+  "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
+  "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
+  "vmovdqu8 (%[unperm]),%%zmm18 \n"
+
 #define YUVTORGB16_AVX2(yuvconstants) \
   "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
   "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
@@ -3378,7 +3412,20 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
   "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
 
+#define YUVTORGB16_AVX512BW(yuvconstants) \
+  "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \
+  "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \
+  "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \
+  "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \
+  "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \
+  "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \
+  "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \
+  "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \
+  "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n"
+
 #define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
+#define YUVTORGB_REGS_AVX512BW \
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm16", "xmm17", "xmm18",
 #else
 
 // Convert 16 pixels: 16 UV and 16 Y.
@@ -3413,6 +3460,15 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
   "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
 
+#define YUVTORGB_AVX512BW(yuvconstants) \
+  YUVTORGB16_AVX512BW(yuvconstants) \
+  "vpsraw $0x6,%%zmm0,%%zmm0 \n" \
+  "vpsraw $0x6,%%zmm1,%%zmm1 \n" \
+  "vpsraw $0x6,%%zmm2,%%zmm2 \n" \
+  "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" \
+  "vpackuswb %%zmm1,%%zmm1,%%zmm1 \n" \
+  "vpackuswb %%zmm2,%%zmm2,%%zmm2 \n"
+
 // Store 16 ARGB values.
 #define STOREARGB_AVX2 \
   "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
@@ -3425,6 +3481,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
   "lea 0x40(%[dst_argb]), %[dst_argb] \n"
 
+// Store 32 ARGB values.
+#define STOREARGB_AVX512BW \
+  "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \
+  "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \
+  "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \
+  "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
+  "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
+  "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
+  "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
+  "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
+  "lea 0x80(%[dst_argb]), %[dst_argb] \n"
+
 // Store 16 AR30 values.
 #define STOREAR30_AVX2 \
   "vpsraw $0x4,%%ymm0,%%ymm0 \n" \
@@ -3521,6 +3589,50 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
 }
 #endif  // HAS_I422TOARGBROW_AVX2
 
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+static const uint64_t kSplitQuadWords[8] = {0, 2, 2, 2, 1, 2, 2, 2};
+static const uint64_t kSplitDoubleQuadWords[8] = {0, 1, 4, 4, 2, 3, 4, 4};
+static const uint64_t kUnpermuteAVX512[8] = {0, 4, 1, 5, 2, 6, 3, 7};
+
+// 32 pixels
+// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 ARGB (128
+// bytes).
+void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf,
+                                   const uint8_t* u_buf,
+                                   const uint8_t* v_buf,
+                                   uint8_t* dst_argb,
+                                   const struct YuvConstants* yuvconstants,
+                                   int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX512BW(yuvconstants)
+    "sub %[u_buf],%[v_buf] \n"
+    "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n"
+    "vpbroadcastq %%xmm5,%%zmm5 \n"
+
+    LABELALIGN
+    "1: \n"
+    READYUV422_AVX512BW
+    YUVTORGB_AVX512BW(yuvconstants)
+    STOREARGB_AVX512BW
+    "sub $0x20,%[width] \n"
+    "jg 1b \n"
+
+    "vzeroupper \n"
+  : [y_buf]"+r"(y_buf),  // %[y_buf]
+    [u_buf]"+r"(u_buf),  // %[u_buf]
+    [v_buf]"+r"(v_buf),  // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)  // %[width]
+  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
+    [quadsplitperm]"r"(kSplitQuadWords),  // %[quadsplitperm]
+    [dquadsplitperm]"r"(kSplitDoubleQuadWords),  // %[dquadsplitperm]
+    [unperm]"r"(kUnpermuteAVX512)  // %[unperm]
+  : "memory", "cc", YUVTORGB_REGS_AVX512BW
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I422TOARGBROW_AVX512BW
+
 #if defined(HAS_I422TOAR30ROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 69e9fac86..5471b2585 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3907,14 +3907,14 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
   asm volatile(
       "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n"  // 3 shuffler constants
       "1: \n"
-      "ld1 {v0.16b}, [%0], #16 \n"  // load 16 Y values
-      "ld1 {v1.16b}, [%1], #16 \n"  // load 8 VU values
+      "ld1 {v0.16b}, [%0], #16 \n"            // load 16 Y values
+      "ld1 {v1.16b}, [%1], #16 \n"            // load 8 VU values
       "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n"  // weave into YUV24
       "prfm pldl1keep, [%0, 448] \n"
       "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
       "prfm pldl1keep, [%1, 448] \n"
       "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
-      "subs %w3, %w3, #16 \n"  // 16 pixels per loop
+      "subs %w3, %w3, #16 \n"                 // 16 pixels per loop
       "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n"  // store 16 YUV pixels
       "b.gt 1b \n"
       : "+r"(src_y),  // %0
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 66b69d8f1..ddad6db7c 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -611,6 +611,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
     }
  }
 #endif
+#if defined(HAS_I422TOARGBROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW | kCpuHasAVX512VL) == (kCpuHasAVX512BW | kCpuHasAVX512VL)) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX512BW;
+    if (IS_ALIGNED(src_width, 32)) {
+      I422ToARGBRow = I422ToARGBRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
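
A note on the dispatch blocks repeated in convert_argb.cc and scale_argb.cc above: libyuv's TestCpuFlag() returns the subset of the requested flag bits that are actually set, so comparing its result against the full mask selects the new kernel only when both AVX512BW and AVX512VL are present. Both are needed because the kernel mixes zmm instructions with EVEX-encoded xmm/ymm forms such as "vmovdqu8 (%[y_buf]),%%ymm4", and the 128/256-bit forms of AVX512BW instructions require the VL extension. A minimal sketch of the check as a standalone helper (the helper name is mine, not part of the patch):

#include "libyuv/cpu_id.h"  // TestCpuFlag, kCpuHasAVX512BW, kCpuHasAVX512VL

// Hypothetical helper illustrating the pattern used in the dispatch blocks:
// TestCpuFlag() masks the detected CPU info with the requested bits, so the
// equality holds only if every requested feature bit came back set.
static bool HasAvx512BwVl() {
  const int kRequired = kCpuHasAVX512BW | kCpuHasAVX512VL;
  return libyuv::TestCpuFlag(kRequired) == kRequired;
}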
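
The row_any.cc line ANY31C(I422ToARGBRow_Any_AVX512BW, I422ToARGBRow_AVX512BW, 1, 0, 4, 31) builds the remainder-handling wrapper: UVSHIFT=1 because U and V are half-width in I422, BPP=4 for ARGB output, and MASK=31 because the kernel consumes 32 pixels per iteration. Conceptually, the wrapper runs the SIMD body on the largest multiple of 32 pixels, then bounces the tail through zero-padded temporaries so the kernel never reads or writes past the end of the row. A simplified model of that behavior, with made-up names and the yuvconstants argument dropped for brevity (the real ANY31C macro in row_any.cc is the authority):

#include <stdint.h>
#include <string.h>

typedef void (*I422RowFn)(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int width);

// Simplified model of the ANY31C wrapper for a 32-pixel ARGB kernel.
static void I422RowAny32(I422RowFn row, const uint8_t* y, const uint8_t* u,
                         const uint8_t* v, uint8_t* dst, int width) {
  int n = width & ~31;  // MASK = 31: whole 32-pixel blocks
  if (n > 0) {
    row(y, u, v, dst, n);
  }
  int r = width - n;  // 0..31 leftover pixels
  if (r > 0) {
    uint8_t ty[32] = {0}, tu[16] = {0}, tv[16] = {0}, td[32 * 4];
    memcpy(ty, y + n, r);                    // Y is full width
    memcpy(tu, u + (n >> 1), (r + 1) >> 1);  // UVSHIFT = 1: UV is half width
    memcpy(tv, v + (n >> 1), (r + 1) >> 1);
    row(ty, tu, tv, td, 32);                 // one padded 32-pixel block
    memcpy(dst + n * 4, td, r * 4);          // BPP = 4: copy back valid ARGB
  }
}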
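
At the public API level nothing changes; existing callers such as I420ToARGB() pick up the AVX512BW path automatically once the CPU flags allow it. A minimal caller for reference (dimensions are arbitrary; a width that is a multiple of 32 keeps whole rows on the non-Any kernel):

#include <vector>
#include "libyuv/convert_argb.h"

int ConvertFrame() {
  const int width = 1280;  // multiple of 32: whole rows use the new kernel
  const int height = 720;
  std::vector<uint8_t> y(width * height, 128);
  std::vector<uint8_t> u((width / 2) * (height / 2), 128);
  std::vector<uint8_t> v((width / 2) * (height / 2), 128);
  std::vector<uint8_t> argb(static_cast<size_t>(width) * 4 * height);
  // I420ToARGB() reaches the dispatch added in convert_argb.cc and selects
  // I422ToARGBRow_AVX512BW when AVX512BW and AVX512VL are both detected.
  return libyuv::I420ToARGB(y.data(), width, u.data(), width / 2, v.data(),
                            width / 2, argb.data(), width * 4, width, height);
}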