diff --git a/README.chromium b/README.chromium index 5001ce745..21b0a31fd 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1182 +Version: 1183 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9ef551d66..93340793e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -192,6 +192,7 @@ extern "C" { #define HAS_I422TOARGBROW_AVX2 #define HAS_I422TOBGRAROW_AVX2 #define HAS_I422TOABGRROW_AVX2 +#define HAS_I422TORGBAROW_AVX2 #define HAS_YUY2TOYROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 @@ -214,7 +215,6 @@ extern "C" { // The following are require VS2012. // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) -#define HAS_I422TORGBAROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index c5b249f8b..b666cb571 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1182 +#define LIBYUV_VERSION 1183 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index d44d1abeb..8ce59178c 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2187,6 +2187,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" "sub $0x10,%[width] \n" "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2206,7 +2207,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, #if defined(HAS_I422TOABGRROW_AVX2) // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2233,6 +2234,7 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" "sub $0x10,%[width] \n" "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] @@ -2248,7 +2250,54 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, #endif ); } -#endif // HAS_I422TOARGBROW_AVX2 +#endif // HAS_I422TOABGRROW_AVX2 + +#if defined(HAS_I422TORGBAROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). +void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpxor %%ymm4,%%ymm4,%%ymm4 \n" + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB_AVX2 + + // Step 3: Weave into RGBA + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" + "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants_AVX.kUVToB_AVX) // %[kYuvConstants] + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_I422TORGBAROW_AVX2 #ifdef HAS_YTOARGBROW_SSE2 void YToARGBRow_SSE2(const uint8* y_buf,