diff --git a/README.chromium b/README.chromium
index 21b0a31fd..94d2030dc 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1183
+Version: 1184
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 93340793e..ef0c36f0c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -203,6 +203,8 @@ extern "C" {
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
 
 // Effects:
 #define HAS_ARGBADDROW_AVX2
@@ -216,8 +218,6 @@ extern "C" {
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
 #define HAS_ARGBTOUVROW_AVX2
-#define HAS_ARGBTOYJROW_AVX2
-#define HAS_ARGBTOYROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
 #endif // defined(VISUALC_HAS_AVX2)
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index b666cb571..d6209bf43 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1183
+#define LIBYUV_VERSION 1184
 
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 8ce59178c..ea33f8606 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -706,8 +706,8 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
 #ifdef HAS_ARGBTOYROW_SSSE3
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
-    "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    "movdqa %4,%%xmm5 \n"
     LABELALIGN
   "1: \n"
     "movdqu " MEMACCESS(0) ",%%xmm0 \n"
@@ -782,6 +782,108 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 }
 #endif // HAS_ARGBTOYJROW_SSSE3
 
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd indices that undo the lane interleave left by vphaddw + vpackuswb.
+// Both work on each 128 bit lane separately, so the packed result holds the
+// eight groups of 4 Y bytes in the order 0, 2, 4, 6, 1, 3, 5, 7.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// Converts ARGB pixels to Y, 32 pixels (128 source bytes) per loop iteration.
+// Each Y is the kARGBToY weighted sum of the 4 pixel bytes, shifted right by
+// 7, plus the kAddY16 bias. The body runs at least once, then while pix > 0.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4 \n"  // coefficients in both lanes.
+    "vbroadcastf128 %4,%%ymm5 \n"  // bias in both lanes.
+    "vmovdqu %5,%%ymm6 \n"  // permute indices.
+    LABELALIGN
+  "1: \n"
+    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"  // 4 loads of 32 bytes each.
+    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+    "lea " MEMLEA(0x80,0) ",%0 \n"
+    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates: per lane order.
+    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates: per lane order.
+    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate: restore pixel order.
+    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"  // add bias bytes.
+    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+    "lea " MEMLEA(0x20,1) ",%1 \n"
+    "sub $0x20,%2 \n"
+    "jg 1b \n"
+    "vzeroupper \n"
+  : "+r"(src_argb), // %0
+    "+r"(dst_y), // %1
+    "+r"(pix) // %2
+  : "m"(kARGBToY), // %3
+    "m"(kAddY16), // %4
+    "m"(kPermdARGBToY_AVX) // %5
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYJROW_AVX2
+// YJ variant of ARGBToYRow_AVX2: 32 pixels (128 source bytes) per iteration.
+// Each Y is the kARGBToYJ weighted sum of the 4 pixel bytes plus kAddYJ64
+// for rounding, shifted right by 7. No bias is added after the shift.
+// NOTE(review): kAddYJ64 is assumed to be the 8 x int16 rounding constant
+// used by ARGBToYJRow_SSSE3 in this file - confirm it is in scope here.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vbroadcastf128 %3,%%ymm4 \n"  // coefficients in both lanes.
+    "vbroadcastf128 %4,%%ymm5 \n"  // rounding words in both lanes.
+    "vmovdqu %5,%%ymm6 \n"  // permute indices.
+    LABELALIGN
+  "1: \n"
+    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"  // 4 loads of 32 bytes each.
+    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+    "lea " MEMLEA(0x80,0) ",%0 \n"
+    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n"  // mutates: per lane order.
+    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"  // add rounding before the shift.
+    "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
+    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"  // mutates: per lane order.
+    "vpermd %%ymm0,%%ymm6,%%ymm0 \n"  // unmutate: restore pixel order.
+    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+    "lea " MEMLEA(0x20,1) ",%1 \n"
+    "sub $0x20,%2 \n"
+    "jg 1b \n"
+    "vzeroupper \n"
+  : "+r"(src_argb), // %0
+    "+r"(dst_y), // %1
+    "+r"(pix) // %2
+  : "m"(kARGBToYJ), // %3
+    "m"(kAddYJ64), // %4
+    "m"(kPermdARGBToY_AVX) // %5
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+#endif // HAS_ARGBTOYJROW_AVX2
+
 #ifdef HAS_ARGBTOUVROW_SSSE3
 // TODO(fbarchard): pass xmm constants to single block of assembly.
 // fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes