From cc88adc6209f3aa9c25adc6be02743fa2e9e9c80 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Thu, 23 Jun 2016 20:16:55 -0700 Subject: [PATCH] YUV scale filter columns improved filtering accuracy upscale a YUV image. observe change in hue.. green especially. disable ScaleFilterCols_SSSE3, falling back on ScaleFilterCols_C observe hue.. green especially, is better. was ScaleFrom1280x720_Bilinear (1620 ms) now ScaleFrom1280x720_Bilinear (1907 ms) BUG=libyuv:605 TEST=try bots R=harryjin@google.com, wangcheng@google.com Review URL: https://codereview.chromium.org/2084533006 . --- README.chromium | 2 +- include/libyuv/scale_row.h | 3 +-- include/libyuv/version.h | 2 +- source/scale_common.cc | 14 +++++++++-- source/scale_gcc.cc | 50 +++++++++++++++++++++++++++++--------- source/scale_neon.cc | 4 +++ source/scale_win.cc | 35 +++++++++++++++++++------- unit_test/scale_test.cc | 10 ++++---- 8 files changed, 88 insertions(+), 32 deletions(-) diff --git a/README.chromium b/README.chromium index a1c37a3c2..ca11605ae 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1599 +Version: 1600 License: BSD License File: LICENSE diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 3064ddbdd..df699e6c2 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -61,8 +61,7 @@ extern "C" { #define HAS_SCALEARGBROWDOWN2_SSE2 #define HAS_SCALEARGBROWDOWNEVEN_SSE2 #define HAS_SCALECOLSUP2_SSE2 -// TODO(fbarchard): HAS_SCALEFILTERCOLS_SSSE3 doesnt match C very well. 
-// #define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALEFILTERCOLS_SSSE3 #define HAS_SCALEROWDOWN2_SSSE3 #define HAS_SCALEROWDOWN34_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 8eb80e31a..6434a8dbd 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1599 +#define LIBYUV_VERSION 1600 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale_common.cc b/source/scale_common.cc index d3992df2e..baed70b9d 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -417,8 +417,16 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, } // (1-f)a + fb can be replaced with a + f(b-a) +#if defined(__arm__) +// arm uses 16 bit math with truncation. +// TODO(fbarchard): add rounding. #define BLENDER(a, b, f) (uint8)((int)(a) + \ - ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + (((int)((f)) * ((int)(b) - (int)(a))) >> 16)) +#else +// intel uses 7 bit math with rounding. +#define BLENDER(a, b, f) (uint8)((int)(a) + \ + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +#endif void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { @@ -470,8 +478,9 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, } #undef BLENDER +// Same as 8 bit arm blender but return is cast to uint16 #define BLENDER(a, b, f) (uint16)((int)(a) + \ - ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + (((int)((f)) * ((int)(b) - (int)(a))) >> 16)) void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, int dst_width, int x, int dx) { @@ -809,6 +818,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, } } +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=605. 
// Mimics SSSE3 blender #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 #define BLENDERC(a, b, f, s) (uint32)( \ diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 400f2fde9..8d234edaf 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -821,6 +821,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { } #endif // HAS_SCALEADDROW_AVX2 +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static uvec8 kFsub80 = + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; + +// Constant for making pixels unsigned and adding .5 for rounding. +static uvec16 kFadd40 = + { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; + // Bilinear column filtering. SSSE3 version. void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { @@ -831,7 +841,10 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "movl $0x04040000,%k2 \n" "movd %k2,%%xmm5 \n" "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + "pextrw $0x1,%%xmm2,%k3 \n" "subl $0x2,%5 \n" "jl 29f \n" @@ -853,13 +866,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "movd %k2,%%xmm4 \n" "pshufb %%xmm5,%%xmm1 \n" "punpcklwd %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" "pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x3,%%xmm2,%k4 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k2 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. 
+ "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" "mov %w2," MEMACCESS(0) " \n" "lea " MEMLEA(0x2,0) ",%0 \n" "sub $0x2,%5 \n" @@ -873,11 +889,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "movd %k2,%%xmm0 \n" "psrlw $0x9,%%xmm2 \n" "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" "mov %b2," MEMACCESS(0) " \n" "99: \n" : "+r"(dst_ptr), // %0 @@ -887,9 +906,16 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "=&r"(x1), // %4 "+rm"(dst_width) // %5 : "rm"(x), // %6 - "rm"(dx) // %7 + "rm"(dx), // %7 +#if defined(__x86_64__) + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 +#else + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 +#endif : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 95f3362a4..26bb70592 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -572,6 +572,10 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, MEMACCESS(6) \ "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" +// The NEON version mimics this formula: +// #define BLENDER(a, b, f) (uint8)((int)(a) + +// ((int)(f) * ((int)(b) - (int)(a)) >> 16)) + void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; diff --git a/source/scale_win.cc b/source/scale_win.cc index 21b1ed923..f17097365 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -860,6 +860,16 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { } #endif 
// HAS_SCALEADDROW_AVX2 +// Constant for making pixels signed to avoid pmaddubsw +// saturation. +static uvec8 kFsub80 = + { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; + +// Constant for making pixels unsigned and adding .5 for rounding. +static uvec16 kFadd40 = + { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; + // Bilinear column filtering. SSSE3 version. __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, @@ -877,6 +887,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movd xmm5, eax pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 + pcmpeqb xmm7, xmm7 // generate 0x0001 + psrlw xmm7, 15 pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 @@ -899,20 +911,22 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movd xmm4, ebx pshufb xmm1, xmm5 // 0011 punpcklwd xmm0, xmm4 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. pextrw eax, xmm2, 1 // get x0 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // 8 bits, 2 pixels. - movd ebx, xmm0 + paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. + movd ebx, xmm1 mov [edi], bx lea edi, [edi + 2] sub ecx, 2 // 2 pixels jge xloop2 xloop29: - add ecx, 2 - 1 jl xloop99 @@ -921,11 +935,14 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movd xmm0, ebx psrlw xmm2, 9 // 7 bit fractions. pshufb xmm2, xmm5 // 0011 + psubb xmm0, xmmword ptr kFsub80 // make pixels signed. 
pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // 16 bit - psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // 8 bits - movd ebx, xmm0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit + paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits + movd ebx, xmm2 mov [edi], bl xloop99: diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 7c9409631..f40443e29 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -314,10 +314,10 @@ static int TestFilter_16(int src_width, int src_height, TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(4, 1, 4, 0) -TEST_FACTOR(8, 1, 8, 3) +TEST_FACTOR(8, 1, 8, 0) TEST_FACTOR(3by4, 3, 4, 1) TEST_FACTOR(3by8, 3, 8, 1) -TEST_FACTOR(3, 1, 3, 3) +TEST_FACTOR(3, 1, 3, 0) #undef TEST_FACTOR1 #undef TEST_FACTOR #undef SX @@ -356,9 +356,9 @@ TEST_FACTOR(3, 1, 3, 3) // Test scale to a specified size with all 4 filters. #define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 3) \ - TEST_SCALETO1(name, width, height, Bilinear, 3) \ - TEST_SCALETO1(name, width, height, Box, 3) + TEST_SCALETO1(name, width, height, Linear, 0) \ + TEST_SCALETO1(name, width, height, Bilinear, 0) \ + TEST_SCALETO1(name, width, height, Box, 0) TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 320, 240)