diff --git a/README.chromium b/README.chromium index 5649bef1f..36b1d8ad9 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1159 +Version: 1160 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d2fb55934..95083511e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -203,6 +203,7 @@ extern "C" { // Effects: #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 +#define HAS_ARGBMULTIPLYROW_AVX2 #endif // The following are require VS2012. @@ -220,7 +221,6 @@ extern "C" { // Effects: #define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 -#define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #endif // defined(VISUALC_HAS_AVX2) diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 132c83fd0..ad93f49bc 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1159 +#define LIBYUV_VERSION 1160 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 1cde6ba66..f71752665 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3890,7 +3890,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN @@ -3925,6 +3925,45 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBMULTIPLYROW_SSE2 +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +// Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 
+// Per channel: dst = (src0 * 257 * src1) >> 16, using a 16-bit multiply-high.
+// Each pass handles 32 bytes and the count is tested after the pass, so width
+// is effectively rounded up to a multiple of 8 pixels (minimum one pass).
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"  // 32 bytes of src_argb0
+    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"  // 32 bytes of src_argb1
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"  // src0 byte -> word * 257
+    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // src1 byte -> zero-ext word
+    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"  // high 16 bits of product
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // words back to bytes
+    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
+    "lea        " MEMLEA(0x20,2) ",%2          \n"
+    "sub        $0x8,%3                        \n"  // 8 pixels done
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"  // unconditional
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,