diff --git a/README.chromium b/README.chromium index 707cdd1dc..4efd81767 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1193 +Version: 1194 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index efcc222b1..0571f38e7 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -189,6 +189,7 @@ extern "C" { #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX @@ -215,12 +216,6 @@ extern "C" { #define HAS_ARGBUNATTENUATEROW_AVX2 #endif -// The following are require VS2012. -// TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) -#define HAS_ARGBTOUVROW_AVX2 -#endif // defined(VISUALC_HAS_AVX2) - // The following are Yasm x86 only: // TODO(fbarchard): Port AVX2 to inline. #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b8bb9d85e..793a5a261 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1193 +#define LIBYUV_VERSION 1194 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 55727301c..d7b71bf6c 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -953,6 +953,74 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVROW_SSSE3 +#ifdef HAS_ARGBTOUVROW_AVX2 +// vpshufb for vphaddw + vpackuswb packed to shorts. +static const lvec8 kShufARGBToUV_AVX = { + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 +}; +void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + LABELALIGN + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" + "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" + VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 + VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) + VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) + VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) + "lea " MEMLEA(0x80,0) ",%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" + VEXTOPMEM(vextractf128,ymm0,0x1,1,2,1) // vextractf128 $0x1,%%ymm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBTOUVROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_SSSE3 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,