diff --git a/README.chromium b/README.chromium
index 84220bb9e..16accaaf6 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1554
+Version: 1555
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 353bc59a2..15e0598fe 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -69,12 +69,12 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
     defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
 #define HAS_SCALEADDROW_AVX2
+#define HAS_SCALEROWDOWN2_AVX2
 #endif
 
 // The following are available for Visual C and clangcl 32 bit:
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
-#define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
 #endif
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 177c98a41..0b36f7cfb 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1554
+#define LIBYUV_VERSION 1555
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index eeeb165cb..9424ecedd 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -188,6 +188,104 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
+#ifdef HAS_SCALEROWDOWN2_AVX2
+void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x20,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1"
+  );
+}
+
+void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x20,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
+  );
+}
+
+void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
+    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
+    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)  //  vmovdqu (%0,%3,1),%%ymm2
+    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)  //  vmovdqu 0x20(%0,%3,1),%%ymm3
+    "lea        " MEMLEA(0x40,0) ",%0          \n"
+    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
+    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
+    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    "sub        $0x20,%2                       \n"
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "r"((intptr_t)(src_stride))  // %3
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_SCALEROWDOWN2_AVX2
+
 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   asm volatile (
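[Reviewer note, not part of the patch] For reference while reading the asm above: ScaleRowDown2 keeps the odd byte of each source pair (vpsrlw $0x8 on little-endian words), ScaleRowDown2Linear computes a rounded horizontal average via vpmaddubsw against a vector of 0x01 weights followed by vpavgw against zero, and ScaleRowDown2Box averages each 2x2 block with the same trick plus one extra vpsrlw. A minimal scalar sketch of the box kernel, with an illustrative name that is not part of libyuv:

#include <stdint.h>

// Scalar reference for what ScaleRowDown2Box_AVX2 computes: the rounded
// average of each 2x2 source block. The asm forms (a+b) and (c+d) with
// vpmaddubsw, adds them, halves with vpsrlw $0x1, then rounds up with
// vpavgw against zero; (((a+b+c+d) >> 1) + 1) >> 1 equals
// (a+b+c+d+2) >> 2 for all byte inputs, so the SIMD path matches this
// reference bit-exactly.
static void ScaleRowDown2Box_Ref(const uint8_t* s /* row 0 */,
                                 const uint8_t* t /* row 1 */,
                                 uint8_t* d, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    d[x] = (uint8_t)((s[2 * x] + s[2 * x + 1] +
                      t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}

That bit-exact rounding is what allows the 1/2 scale factor tests in scale_test.cc below to tighten their tolerance to 0.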
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 126a737ff..ce474fa1e 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -298,18 +298,18 @@ static int TestFilter_16(int src_width, int src_height,
 
 // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
 // filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom) \
+#define TEST_FACTOR(name, nom, denom, maxdiff) \
     TEST_FACTOR1(name, None, nom, denom, 0) \
-    TEST_FACTOR1(name, Linear, nom, denom, 3) \
-    TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
-    TEST_FACTOR1(name, Box, nom, denom, 3)
+    TEST_FACTOR1(name, Linear, nom, denom, maxdiff) \
+    TEST_FACTOR1(name, Bilinear, nom, denom, maxdiff) \
+    TEST_FACTOR1(name, Box, nom, denom, maxdiff)
 
-TEST_FACTOR(2, 1, 2)
-TEST_FACTOR(4, 1, 4)
-TEST_FACTOR(8, 1, 8)
-TEST_FACTOR(3by4, 3, 4)
-TEST_FACTOR(3by8, 3, 8)
-TEST_FACTOR(3, 1, 3)
+TEST_FACTOR(2, 1, 2, 0)
+TEST_FACTOR(4, 1, 4, 3)
+TEST_FACTOR(8, 1, 8, 3)
+TEST_FACTOR(3by4, 3, 4, 3)
+TEST_FACTOR(3by8, 3, 8, 3)
+TEST_FACTOR(3, 1, 3, 3)
 #undef TEST_FACTOR1
 #undef TEST_FACTOR
 #undef SX
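[Reviewer note, not part of the patch] All three new kernels end with vpackuswb followed by vpermq $0xd8 before the store, and that permute matters: AVX2 vpackuswb packs within each 128-bit lane, so the packed bytes come out in qword order a_lo, b_lo, a_hi, b_hi. A small intrinsics sketch of the same pattern (hypothetical helper, assumes AVX2 and immintrin.h, not code from this patch):

#include <immintrin.h>

// Pack two vectors of uint16 into one vector of uint8 in linear order.
// _mm256_packus_epi16 interleaves the 128-bit lanes; permuting qwords
// with immediate 0xd8 (order 0,2,1,3) restores linear byte order,
// matching the vpermq $0xd8 in the asm above.
static inline __m256i PackU16ToU8Linear(__m256i a, __m256i b) {
  __m256i packed = _mm256_packus_epi16(a, b);
  return _mm256_permute4x64_epi64(packed, 0xd8);
}

Each loop iteration reads 64 source bytes per row and writes 32 destination bytes, hence the sub $0x20,%2; widths that are not multiples of 32 are presumably handled by libyuv's usual any-width wrappers, which are outside this diff.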