diff --git a/README.chromium b/README.chromium index d3b266d46..373465f0a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1180 +Version: 1181 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 471d66a41..bc4cad02a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -189,6 +189,7 @@ extern "C" { #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 #define HAS_I422TOBGRAROW_AVX2 #define HAS_YUY2TOYROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2 @@ -215,7 +216,6 @@ extern "C" { #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 -#define HAS_I422TOARGBROW_AVX2 #define HAS_I422TORGBAROW_AVX2 #define HAS_I422TOABGRROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 220b31d68..f9491632f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1180 +#define LIBYUV_VERSION 1181 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index ce63299be..1235bfbce 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1950,6 +1950,8 @@ struct { YG, YG, YG, YG, YG, YG, YG, YG } }; +// 32 pixels +// 16 UV values upsampled to 32 UV, mixed with 32 Y producing 32 BGRA pixels. void I422ToBGRARow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2120,7 +2122,87 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, #endif ); } -#endif // HAS_I422ToBGRAROW_AVX2 +#endif // HAS_I422TOBGRAROW_AVX2 + +#if defined(HAS_I422TOARGBROW_AVX2) +// Read 8 UV from 422, upsample to 16 UV. 
+#define READYUV422_AVX2 /* out: ymm0 = 16 UV byte pairs; clobbers ymm1 */ \
+    "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" /* xmm0 = 8 U bytes */ \
+    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) /* xmm1 = 8 bytes at u_buf + v_buf*1, presumably (MEMOPREG is defined elsewhere) */ \
+    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" /* u_buf += 8 */ \
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" /* interleave bytes: U0 V0 U1 V1 ... U7 V7 */ \
+    "vpermq $0xd8,%%ymm0,%%ymm0 \n" /* qwords 0,1,2,3 -> 0,2,1,3: UV0-3 in low lane, UV4-7 in high lane */ \
+    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" /* duplicate every UV pair: 8 pairs -> 16 pairs */
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2 /* in: ymm0 = 16 UV pairs, ymm4 = 0; out: ymm0, ymm1, ymm2; clobbers ymm3 */ \
+    "vpmaddubsw " MEMACCESS2(64, [kYuvConstants]) ",%%ymm0,%%ymm2 \n" /* ymm2 = UV * coefficients at +64 */ \
+    "vpmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%ymm0,%%ymm1 \n" /* ymm1 = UV * coefficients at +32 */ \
+    "vpmaddubsw " MEMACCESS([kYuvConstants]) ",%%ymm0,%%ymm0 \n" /* ymm0 = UV * coefficients at +0 (last: overwrites the UV input) */ \
+    "vpsubw " MEMACCESS2(160, [kYuvConstants]) ",%%ymm2,%%ymm2 \n" /* subtract the 16-bit constants at +160 ... */ \
+    "vpsubw " MEMACCESS2(128, [kYuvConstants]) ",%%ymm1,%%ymm1 \n" /* ... +128 ... */ \
+    "vpsubw " MEMACCESS2(96, [kYuvConstants]) ",%%ymm0,%%ymm0 \n" /* ... and +96 (presumably per-channel UV bias; table not visible here) */ \
+    "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" /* xmm3 = 16 Y bytes */ \
+    "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" /* y_buf += 16 */ \
+    "vpermq $0xd8,%%ymm3,%%ymm3 \n" /* Y0-7 in low lane, Y8-15 in high lane */ \
+    "vpunpcklbw %%ymm4,%%ymm3,%%ymm3 \n" /* widen Y to 16-bit words; relies on ymm4 being zero */ \
+    "vpsubsw " MEMACCESS2(192, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" /* saturating subtract of the constants at +192 */ \
+    "vpmullw " MEMACCESS2(224, [kYuvConstants]) ",%%ymm3,%%ymm3 \n" /* multiply by the constants at +224 */ \
+    "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" /* add the Y term to each channel (saturating) */ \
+    "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
+    "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
+    "vpsraw $0x6,%%ymm0,%%ymm0 \n" /* arithmetic shift right by 6 bits */ \
+    "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+    "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" /* words -> unsigned saturated bytes; each lane holds its 8 bytes twice */ \
+    "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+    "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,  // 16 Y bytes read per loop
+                               const uint8* u_buf,  // 8 U bytes read per loop
+                               const uint8* v_buf,  // 8 V bytes read per loop
+                               uint8* dst_argb,     // 64 bytes written per loop
+                               int width) {         // pixel count; loop steps by 16
+  asm volatile (  // Converts 16 pixels per iteration and stores them in order.
+    "sub %[u_buf],%[v_buf] \n"  // v_buf becomes an offset relative to u_buf
+    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones, woven in as the A byte
+    "vpxor %%ymm4,%%ymm4,%%ymm4 \n"  // ymm4 = 0, required by YUVTORGB_AVX2
+    LABELALIGN
+  "1: \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2
+
+    // Step 3: Weave into ARGB
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+    "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // lanes: [BG0-3, BG8-11] [BG4-7, BG12-15]
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+    "vpermq $0xd8,%%ymm2,%%ymm2 \n"  // lanes: [RA0-3, RA8-11] [RA4-7, RA12-15]
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels (ymm1)
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels (ymm0)
+
+    "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"  // pixels 0-7 go first: ymm1, not ymm0
+    "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"  // pixels 8-15 follow at +32 bytes
+    "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"  // dst_argb += 64
+    "sub $0x10,%[width] \n"  // 16 pixels done
+    "jg 1b \n"  // body always runs once; no tail handling for width % 16 here
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [u_buf]"+r"(u_buf), // %[u_buf]
+    [v_buf]"+r"(v_buf), // %[v_buf]
+    [dst_argb]"+r"(dst_argb), // %[dst_argb]
+    [width]"+rm"(width) // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants_AVX.kUVToB_AVX) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif // HAS_I422TOARGBROW_AVX2

 #ifdef HAS_YTOARGBROW_SSE2
 void YToARGBRow_SSE2(const uint8* y_buf,