From ae55e418517651548638b27be31d1b2abaed22bb Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Mon, 14 Dec 2015 17:25:36 -0800
Subject: [PATCH] use rounding in scaledown by 2

When scaling down by 2, the formula should round consistently:
  (a + b + c + d + 2) / 4
The C version did, but the SSE2 version was doing two averages:
  avg(avg(a, b), avg(c, d))
This change uses a sum, then rounds.

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:447,libyuv:527

Review URL: https://codereview.chromium.org/1513183004 .
---
 README.chromium            |  2 +-
 include/libyuv/scale_row.h | 24 +++++------
 include/libyuv/version.h   |  2 +-
 source/planar_functions.cc |  8 ++--
 source/scale.cc            | 16 ++++----
 source/scale_any.cc        |  8 ++--
 source/scale_gcc.cc        | 62 ++++++++++++++--------------
 source/scale_win.cc        | 84 +++++++++++++++++++-------------------
 unit_test/planar_test.cc   |  4 +-
 9 files changed, 106 insertions(+), 104 deletions(-)

diff --git a/README.chromium b/README.chromium
index 4ae67c015..84220bb9e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1553
+Version: 1554
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index d198addfd..353bc59a2 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -56,7 +56,7 @@ extern "C" {
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 #define HAS_SCALECOLSUP2_SSE2
 #define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN2_SSSE3
 #define HAS_SCALEROWDOWN34_SSSE3
 #define HAS_SCALEROWDOWN38_SSSE3
 #define HAS_SCALEROWDOWN4_SSE2
@@ -232,12 +232,12 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx);
 
 // Specialized scalers for x86.
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -269,11 +269,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width);
 void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 0ee1518e4..177c98a41 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1553
+#define LIBYUV_VERSION 1554
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 2731bd0da..062da932f 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -699,11 +699,11 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
     if (IS_ALIGNED(halfwidth, 16)) {
-      ScaleRowDown2 = ScaleRowDown2Box_SSE2;
+      ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
     }
   }
 #endif
diff --git a/source/scale.cc b/source/scale.cc
index 0a01304c4..c1cd77c7f 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
-        ScaleRowDown2Box_Any_SSE2);
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
+        ScaleRowDown2Box_Any_SSSE3);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
-          ScaleRowDown2Box_SSE2);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
+          ScaleRowDown2Box_SSSE3);
     }
   }
 #endif
diff --git a/source/scale_any.cc b/source/scale_any.cc
index 2f6a2c8ba..2e1a8075f 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -55,11 +55,11 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
       dst_ptr + n * BPP, r);                                                   \
     }
 
-#ifdef HAS_SCALEROWDOWN2_SSE2
-SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
       ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
       2, 1, 15)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index f0509a069..eeeb165cb 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -98,8 +98,8 @@ static uvec16 kScaleAb2 =
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -120,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "psrlw      $0x8,%%xmm5                    \n"
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"
+    "psrlw      $0xf,%%xmm4                    \n"
+    "packuswb   %%xmm4,%%xmm4                  \n"
+    "pxor       %%xmm5,%%xmm5                  \n"
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
    "lea       " MEMLEA(0x20,0) ",%0            \n"
-    "movdqa     %%xmm0,%%xmm2                  \n"
-    "psrlw      $0x8,%%xmm0                    \n"
-    "movdqa     %%xmm1,%%xmm3                  \n"
-    "psrlw      $0x8,%%xmm1                    \n"
-    "pand       %%xmm5,%%xmm2                  \n"
-    "pand       %%xmm5,%%xmm3                  \n"
-    "pavgw      %%xmm2,%%xmm0                  \n"
-    "pavgw      %%xmm3,%%xmm1                  \n"
-    "packuswb   %%xmm1,%%xmm0                  \n"
+    "pmaddubsw  %%xmm4,%%xmm0                  \n"
+    "pmaddubsw  %%xmm4,%%xmm1                  \n"
+    "pavgw      %%xmm5,%%xmm0                  \n"
+    "pavgw      %%xmm5,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
     "movdqu     %%xmm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x10,1) ",%1            \n"
     "sub        $0x10,%2                       \n"
@@ -147,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   : "+r"(src_ptr),    // %0
     "+r"(dst_ptr),    // %1
     "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
   );
 }
 
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "psrlw      $0x8,%%xmm5                    \n"
+    "pcmpeqb    %%xmm4,%%xmm4                  \n"
+    "psrlw      $0xf,%%xmm4                    \n"
+    "packuswb   %%xmm4,%%xmm4                  \n"
+    "pxor       %%xmm5,%%xmm5                  \n"
 
     LABELALIGN
   "1:                                          \n"
@@ -164,17 +164,17 @@
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0            \n"
-    "pavgb      %%xmm2,%%xmm0                  \n"
-    "pavgb      %%xmm3,%%xmm1                  \n"
-    "movdqa     %%xmm0,%%xmm2                  \n"
\n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" diff --git a/source/scale_win.cc b/source/scale_win.cc index f48a4ee76..6930f7295 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -95,8 +95,8 @@ static uvec16 kScaleAb2 = // Reads 32 pixels, throws half away and writes 16 pixels. __declspec(naked) -void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride ignored @@ -121,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x1 rectangle to 16x1. __declspec(naked) -void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { __asm { mov eax, [esp + 4] // src_ptr // src_stride mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 + + pcmpeqb xmm4, xmm4 // constant 0x0101 + psrlw xmm4, 15 + packuswb xmm4, xmm4 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 + pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm1, xmm4 + pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm1, xmm5 packuswb xmm0, xmm1 - movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -157,16 +154,19 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x2 rectangle to 16x1. 
 __declspec(naked)
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
@@ -174,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4            // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2            // vertical add
+    paddw      xmm1, xmm3
+    psrlw      xmm0, 1
+    psrlw      xmm1, 1
+    pavgw      xmm0, xmm5            // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
-
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -245,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
@@ -263,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }
 
+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
 __declspec(naked)
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -280,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpxor      ymm5, ymm5, ymm5      // constant 0
 
  wloop:
-    vmovdqu    ymm0, [eax]           // average rows
+    vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vmovdqu    ymm2, [eax + esi]
+    vmovdqu    ymm3, [eax + esi + 32]
     lea        eax,  [eax + 64]
-
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw     ymm0, ymm0, ymm2      // vertical add
+    vpaddw     ymm1, ymm1, ymm3
+    vpsrlw     ymm0, ymm0, 1
+    vpsrlw     ymm1, ymm1, 1
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
-
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 271e0cc5e..f31e45f79 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -1422,8 +1422,8 @@ static void TestI420Blend(int width, int height, int benchmark_iterations,
     EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
   }
   for (int i = 0; i < kSizeUV; ++i) {
-    EXPECT_NEAR(dst_u_c[i + off], dst_u_opt[i + off], 1);  // Subsample off by 1
-    EXPECT_NEAR(dst_v_c[i + off], dst_v_opt[i + off], 1);
+    EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
+    EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
   }
   free_aligned_buffer_64(src_y0);
   free_aligned_buffer_64(src_u0);
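The rounding identity the new code relies on, and which the comment added to scale_win.cc states ("average = (sum + 2) / 4 becomes average((sum >> 1), 0)"), can be checked with a small standalone C sketch. This is illustration only and not part of the patch: the helper names avg, box_c, box_old and box_new are hypothetical, and avg() models the rounding-up behaviour of pavgb/pavgw, (x + y + 1) >> 1. The grouping in box_old follows the commit message's avg(avg(a, b), avg(c, d)).

#include <assert.h>
#include <stdio.h>

// Rounding average as pavgb/pavgw compute it: (x + y + 1) / 2.
static unsigned avg(unsigned x, unsigned y) {
  return (x + y + 1) >> 1;
}

// C reference for the 2x2 box filter: (a + b + c + d + 2) / 4.
static unsigned box_c(unsigned a, unsigned b, unsigned c, unsigned d) {
  return (a + b + c + d + 2) >> 2;
}

// Old SSE2 behaviour: two nested rounding averages.
static unsigned box_old(unsigned a, unsigned b, unsigned c, unsigned d) {
  return avg(avg(a, b), avg(c, d));
}

// New behaviour: pmaddubsw/paddw form the sum, psrlw halves it,
// and pavgw against zero rounds the final divide by 2.
static unsigned box_new(unsigned a, unsigned b, unsigned c, unsigned d) {
  unsigned sum = a + b + c + d;
  return avg(sum >> 1, 0);
}

int main(void) {
  // average((sum >> 1), 0) equals (sum + 2) / 4 for every possible
  // sum of four bytes, so the new path matches the C reference.
  for (unsigned sum = 0; sum <= 255 * 4; ++sum) {
    assert(avg(sum >> 1, 0) == ((sum + 2) >> 2));
  }
  // The nested averages can be off by one, e.g. for the block {0, 0, 0, 1}:
  printf("reference %u, nested averages %u, sum-then-round %u\n",
         box_c(0, 0, 0, 1), box_old(0, 0, 0, 1), box_new(0, 0, 0, 1));
  return 0;
}

Because the SSSE3 and AVX2 box filters now round exactly like ScaleRowDown2Box_C, the I420Blend test can tighten its EXPECT_NEAR(..., 1) checks to EXPECT_EQ, which is what the final hunk does.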