diff --git a/README.chromium b/README.chromium index 5c571e2bc..6294ca9dd 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1692 +Version: 1693 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7f5fe9f0f..9c8908e59 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -272,6 +272,7 @@ extern "C" { #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 +#define HAS_I422TOAR30ROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2 #endif @@ -1905,6 +1906,12 @@ void I210ToARGBRow_SSSE3(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToAR30Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToARGBRow_AVX2(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, @@ -2073,6 +2080,12 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToAR30Row_Any_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToARGBRow_Any_AVX2(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index dc96d55e7..6b40f653a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1692 +#define LIBYUV_VERSION 1693 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_any.cc b/source/row_any.cc index 2b3b85272..f6bdfaa0f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -148,6 +148,9 @@ ANY31C(I422ToARGBRow_Any_SSSE3, 
I422ToARGBRow_SSSE3, 1, 0, 4, 7) #ifdef HAS_I422TOAR30ROW_SSSE3 ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) #endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) +#endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 656801ac7..d817556f5 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -2274,7 +2274,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" -#define YUVTORGB_AVX2(yuvconstants) \ +#define YUVTORGB16_AVX2(yuvconstants) \ "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ @@ -2284,13 +2284,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" #define YUVTORGB_REGS_AVX2 \ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", @@ -2298,7 +2292,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, #else // Convert 16 pixels: 16 UV and 16 Y. 
#define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB_AVX2(yuvconstants) \ +#define YUVTORGB16_AVX2(yuvconstants) \ "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ @@ -2311,15 +2305,18 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" +#define YUVTORGB_REGS_AVX2 +#endif + +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" -#define YUVTORGB_REGS_AVX2 -#endif // Store 16 ARGB values. #define STOREARGB_AVX2 \ @@ -2333,6 +2330,33 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ "lea 0x40(%[dst_argb]), %[dst_argb] \n" +// Store 16 AR30 values. 
+#define STOREAR30_AVX2 /* caller preloads ymm5 (alpha word), ymm6 (low clamp), ymm7 (high clamp) */ \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" /* >>4 per 16-bit lane; the 8-bit path (YUVTORGB_AVX2) shifts by 6 */ \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" /* cap each lane at the value held in ymm7 */ \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" /* raise each lane to at least the value in ymm6 */ \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" /* pre-shift so the value starts at bit 20 once placed in the high word */ \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" /* qword order 0,2,1,3 so the per-lane unpacks keep pixel order */ \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" /* pixels 8..15: ymm0 word low, ymm2 word high */ \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" /* pixels 0..7: ymm0 word low, ymm2 word high */ \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" /* pixels 8..15: ymm1 word low, ymm5 word high */ \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" /* pixels 0..7: ymm1 word low, ymm5 word high */ \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" /* ymm1 value now starts at bit 10; the ymm5 word moves up 10 bits with it */ \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" /* merge both halves into finished 32-bit pixels */ \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" /* first 32 bytes (pixels 0..7) */ \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" /* second 32 bytes (pixels 8..15) */ \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" /* advance dst by 64 bytes */ + #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). @@ -2402,6 +2426,46 @@ void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#if defined(HAS_I422TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants: start from all ones in ymm5 + "vpsrlw $14,%%ymm5,%%ymm5 \n" // 0x0003 in every 16-bit lane + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits: 0x0030 per lane, read by STOREAR30_AVX2 + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min: lower clamp bound (vpmaxsw operand in STOREAR30_AVX2) + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" // 0xffff >> 6 = 0x03ff: upper clamp bound (vpminsw operand) + + LABELALIGN + "1: \n" + READYUV422_AVX2 // NOTE(review): macro body is outside this diff; v_buf holds (v_buf - u_buf) at this point + YUVTORGB16_AVX2(yuvconstants) // 16-bit results in ymm0..ymm2, without the >>6 and byte pack of YUVTORGB_AVX2 + STOREAR30_AVX2 // clamp, pack and store 64 bytes, then advance dst_ar30 + "sub $0x10,%[width] \n" // 16 pixels consumed per iteration + "jg 1b \n" // loop while the remaining width is > 0 + + "vzeroupper \n" // zero the upper ymm halves before leaving the asm block + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + #if defined(HAS_I210TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 7c5526986..d33511b0f 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -594,6 +594,7 @@ TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4) TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4) TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1, 0, ARGB, 4) +TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1, 0, ARGB, 4) // TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1, 0, ABGR, 4) #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \