diff --git a/README.chromium b/README.chromium index 576608b52..e5ddebfa3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1261 +Version: 1262 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 982523be2..5a444590f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -203,6 +203,7 @@ extern "C" { #define HAS_UYVYTOUV422ROW_AVX2 #define HAS_UYVYTOUVROW_AVX2 #define HAS_UYVYTOYROW_AVX2 +#define HAS_YTOARGBROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 @@ -217,7 +218,6 @@ extern "C" { // The following are available require VS2012. Port to GCC. #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) -#define HAS_YTOARGBROW_AVX2 // TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393 #define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOARGBROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 49d5581e4..c379dcea1 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1261 +#define LIBYUV_VERSION 1262 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 3cec55027..28e5a7a84 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2292,9 +2292,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, #endif // HAS_I422TORGBAROW_AVX2 #ifdef HAS_YTOARGBROW_SSE2 -void YToARGBRow_SSE2(const uint8* y_buf, - uint8* dst_argb, - int width) { +void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { asm volatile ( "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 "movd %%eax,%%xmm2 \n" @@ -2340,6 +2338,55 @@ void YToARGBRow_SSE2(const uint8* y_buf, } #endif // HAS_YTOARGBROW_SSE2 +#ifdef HAS_YTOARGBROW_AVX2 +// 16 pixels of Y converted to 16 pixels of ARGB (64 
bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+  asm volatile (
+    "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
+    "vmovd %%eax,%%xmm2 \n"
+    "vbroadcastss %%xmm2,%%ymm2 \n" // ymm2 = 16 x 0x4a35 (multiplier)
+    "mov $0x4880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+    "vmovd %%eax,%%xmm3 \n"
+    "vbroadcastss %%xmm3,%%ymm3 \n" // ymm3 = 16 x 0x0488 (bias)
+    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // generate mask 0xff000000
+    "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+    LABELALIGN
+  "1: \n"
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    "vmovdqu (%0),%%xmm0 \n" // load 16 Y bytes
+    "lea 0x10(%0),%0 \n"
+    "vpermq $0xd8,%%ymm0,%%ymm0 \n" // Y0-7 to low lane, Y8-15 to high lane
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" // each Y byte doubled into a word
+    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" // keep high 16 bits of product
+    "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" // subtract bias, saturating at 0
+    "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" // words back to bytes, saturating
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" // GG
+    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" // GGGG for pixels 0-7
+    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" // GGGG for pixels 8-15
+    "vpor %%ymm4,%%ymm0,%%ymm0 \n" // set top byte of each pixel to 0xff
+    "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+    "vmovdqu %%ymm0,(%1) \n" // store 2 x 32 bytes
+    "vmovdqu %%ymm1,0x20(%1) \n"
+    "lea 0x40(%1),%1 \n"
+    "sub $0x10,%2 \n" // 16 pixels done per iteration
+    "jg 1b \n"
+    "vzeroupper \n"
+  : "+r"(y_buf), // %0
+    "+r"(dst_argb), // %1
+    "+rm"(width) // %2
+  :
+  : "memory", "cc", "eax"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+#endif // HAS_YTOARGBROW_AVX2
+
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = { diff --git a/source/row_win.cc b/source/row_win.cc index 0a89e28c1..49df09261 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2354,14 +2354,14 @@ void YToARGBRow_AVX2(const uint8* y_buf, uint8* rgb_buf, int width) { __asm { - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 - vpslld ymm4, ymm4, 24 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) - vmovd xmm3, eax - vbroadcastss ymm3, xmm3 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) vmovd xmm2, eax vbroadcastss ymm2, xmm2 + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + vmovd xmm3, eax + vbroadcastss ymm3, xmm3 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpslld ymm4, ymm4, 24 mov eax, [esp + 4] // Y mov edx, [esp + 8] // rgb