diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 15e0598fe..0feff6b91 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -59,7 +59,7 @@ extern "C" { #define HAS_SCALEROWDOWN2_SSSE3 #define HAS_SCALEROWDOWN34_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEROWDOWN4_SSE2 +#define HAS_SCALEROWDOWN4_SSSE3 #define HAS_SCALEADDROW_SSE2 #endif @@ -244,10 +244,10 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -281,10 +281,10 @@ void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/libyuv.gyp b/libyuv.gyp index 4e42312fc..9cbf641e4 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -47,7 +47,7 @@ # Change type to 'shared_library' to build .so or .dll files. 'type': 'static_library', 'variables': { - 'optimize': 'max', # enable O2 and ltcg. + # 'optimize': 'max', # enable O2 and ltcg. }, # Allows libyuv.a redistributable library without external dependencies. 'standalone_static_library': 1, diff --git a/source/scale.cc b/source/scale.cc index c1cd77c7f..0dfd99b20 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -182,12 +182,12 @@ static void ScalePlaneDown4(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN4_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { +#if defined(HAS_SCALEROWDOWN4_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2; + ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; } } #endif diff --git a/source/scale_any.cc b/source/scale_any.cc index 2e1a8075f..1da4dacde 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -76,9 +76,9 @@ SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, ScaleRowDown2Box_C, 2, 1, 15) #endif -#ifdef HAS_SCALEROWDOWN4_SSE2 -SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C, +#ifdef HAS_SCALEROWDOWN4_SSSE3 +SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C, 4, 1, 7) #endif #ifdef HAS_SCALEROWDOWN4_AVX2 diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 9424ecedd..5d10a01fa 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -286,7 +286,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" @@ -314,12 +314,15 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { intptr_t stridex3 = 0; asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0x8,%%xmm7 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" LABELALIGN @@ -328,31 +331,29 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 - MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4 - MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5 + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm7,%%xmm2 \n" - "pand %%xmm7,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "pavgw %%xmm2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x8,%2 \n" @@ -363,7 +364,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "+r"(stridex3) // %3 : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } diff --git a/source/scale_win.cc b/source/scale_win.cc index 6930f7295..5ab4fa0cc 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -309,7 +309,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // Point samples 32 pixels to 8 pixels. __declspec(naked) -void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr @@ -340,7 +340,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x4 rectangle to 8x1. __declspec(naked) -void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi @@ -350,42 +350,40 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, mov edx, [esp + 8 + 12] // dst_ptr mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + movdqa xmm5, xmm4 + packuswb xmm4, xmm4 + psllw xmm5, 3 // constant 0x0008 wloop: movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] - movdqu xmm4, [eax + edi] - movdqu xmm5, [eax + edi + 16] + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 2 + paddw xmm1, xmm3 + movdqu xmm2, [eax + edi] + movdqu xmm3, [eax + edi + 16] lea eax, [eax + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm7 - pand xmm3, xmm7 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - movdqa xmm2, xmm0 // average columns (16 to 8 pixels) - psrlw xmm0, 8 - pand xmm2, xmm7 - pavgw xmm0, xmm2 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + paddw xmm0, xmm2 // add row 3 + paddw xmm1, xmm3 + phaddw xmm0, xmm1 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 lea edx, [edx + 8] sub ecx, 8 @@ -444,37 +442,41 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, mov edx, [esp + 8 + 12] // dst_ptr mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff - vpsrlw ymm7, ymm7, 8 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpsrlw ymm4, ymm4, 15 + vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpackuswb ymm4, ymm4, ymm4 wloop: vmovdqu ymm0, [eax] // average rows vmovdqu ymm1, [eax + 32] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] + vmovdqu ymm2, [eax + esi] + vmovdqu ymm3, [eax + esi + 32] + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm1, ymm1, ymm4 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + esi * 2] vmovdqu ymm3, [eax + esi * 2 + 32] - vpavgb ymm2, ymm2, [eax + edi] - vpavgb ymm3, ymm3, [eax + edi + 32] - lea eax, [eax + 64] - vpavgb ymm0, ymm0, ymm2 - vpavgb ymm1, ymm1, ymm3 - - vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) - vpand ymm3, ymm1, ymm7 - vpsrlw ymm0, ymm0, 8 - vpsrlw ymm1, ymm1, 8 - vpavgw ymm0, ymm0, ymm2 - vpavgw ymm1, ymm1, ymm3 - vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - - vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) - vpsrlw ymm0, ymm0, 8 - vpavgw ymm0, ymm0, ymm2 + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm1, ymm1, ymm3 + vmovdqu ymm2, [eax + edi] + vmovdqu ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpmaddubsw ymm2, ymm2, ymm4 + vpmaddubsw ymm3, ymm3, ymm4 + vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm1, ymm1, ymm3 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 vpackuswb ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb - vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index ce474fa1e..f31af80b3 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -298,17 +298,17 @@ static int TestFilter_16(int src_width, int src_height, // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, nom, denom, maxdiff) \ +#define TEST_FACTOR(name, nom, denom, boxdiff) \ TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, maxdiff) \ - TEST_FACTOR1(name, Bilinear, nom, denom, maxdiff) \ - TEST_FACTOR1(name, Box, nom, denom, maxdiff) + TEST_FACTOR1(name, Linear, nom, denom, 3) \ + TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(name, Box, nom, denom, boxdiff) TEST_FACTOR(2, 1, 2, 0) -TEST_FACTOR(4, 1, 4, 3) +TEST_FACTOR(4, 1, 4, 0) TEST_FACTOR(8, 1, 8, 3) -TEST_FACTOR(3by4, 3, 4, 3) -TEST_FACTOR(3by8, 3, 8, 3) +TEST_FACTOR(3by4, 3, 4, 1) +TEST_FACTOR(3by8, 3, 8, 1) TEST_FACTOR(3, 1, 3, 3) #undef TEST_FACTOR1 #undef TEST_FACTOR