diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4d8583271..a93658f92 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -279,6 +279,7 @@ extern "C" { #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_I210TOARGBROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2 #endif @@ -1850,6 +1851,12 @@ void I210ToARGBRow_SSSE3(const uint16* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I210ToARGBRow_AVX2(const uint16* src_y, + const uint16* src_u, + const uint16* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2000,6 +2007,12 @@ void I210ToARGBRow_Any_SSSE3(const uint16* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I210ToARGBRow_Any_AVX2(const uint16* src_y, + const uint16* src_u, + const uint16* src_v, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 9b93fc151..292010831 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -448,6 +448,14 @@ static int H010ToAR30Matrix(const uint16* src_y, } } #endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; @@ -537,7 +545,14 @@ static int I010ToARGBMatrix(const uint16* src_y, } } #endif - +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; + } + } +#endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; diff --git a/source/row_any.cc b/source/row_any.cc index 7e557d421..9f4725bf5 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -194,9 +194,8 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #endif #undef ANY31C -// 64 byte per row for future AVX2 // Any 3 planes of 16 bit to 1 with yuvconstants -// TODO(fbarchard): consider +// TODO(fbarchard): consider sharing this code with ANY31C #define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, uint8* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ @@ -218,6 +217,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) #ifdef HAS_I210TOARGBROW_SSSE3 ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7) #endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16, 2, 4, 15) +#endif #undef ANY31CT // Any 2 planes to 1. diff --git a/source/row_gcc.cc b/source/row_gcc.cc index bcf93c701..1d486ee36 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1627,7 +1627,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, // TODO(fbarchard): Consider shufb to replace pack/unpack // TODO(fbarchard): Consider pmulhuw to replace psraw // TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. -#define READYUV422_10 \ +#define READYUV210 \ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ @@ -1637,7 +1637,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, "punpcklwd %%xmm0,%%xmm0 \n" \ "movdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ "psllw $0x6,%%xmm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" + "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ @@ -1892,7 +1892,7 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf, LABELALIGN "1: \n" - READYUV422_10 + READYUV210 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" @@ -1968,7 +1968,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on @@ -2116,6 +2116,23 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +// Read 8 UV from 210 10 bit, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu " MEMACCESS([y_buf]) ",%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea " MEMLEA(0x20, [y_buf]) ",%[y_buf] \n" + // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. #define READYUVA422_AVX2 \ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ @@ -2308,6 +2325,41 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#if defined(HAS_I210TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I210ToARGBRow_AVX2(const uint16* y_buf, + const uint16* u_buf, + const uint16* v_buf, + uint8* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + #if defined(HAS_I422ALPHATOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.