From f7eb04bc416ccfa2db5f3ca8fbc3fd78f9c0d24b Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 11 Nov 2013 23:13:57 +0000 Subject: [PATCH] Port ScaleCols to SSSE3 for Win. BUG=none TEST=Scale* R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/3759004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@849 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/scale.cc | 101 ++++++++++++++++++++++++++++++++++++++- unit_test/scale_test.cc | 6 +-- 4 files changed, 104 insertions(+), 7 deletions(-) diff --git a/README.chromium b/README.chromium index 186cdcb3e..c7b3d7147 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 848 +Version: 849 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 674592788..ced2cbf60 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 848 +#define LIBYUV_VERSION 849 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale.cc b/source/scale.cc index 6c708c795..8b3a5cd3c 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -870,6 +870,94 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
+// TODO(fbarchard): Port to Neon + +// Shuffle table duplicating 2 fractions into 2 bytes each (low 4 bytes used) +static uvec8 kShuffleFractions = { + 0u, 0u, 4u, 4u, 80u, 80u, 80u, 80u, 80u, 80u, 80u, 80u, 80u, 80u, 80u, 80u, +}; + +#define HAS_SCALEFILTERCOLS_SSSE3 +__declspec(naked) __declspec(align(16)) +static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 16 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm7, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm7 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov word ptr [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 16 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movdqa xmm1, xmm2 // x0, x1 fractions. + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. 
+ pshufb xmm1, xmm5 // 0011 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov byte ptr [edi], bl + + align 16 + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + #elif !defined(LIBYUV_DISABLE_X86) && \ ((defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) // GCC versions of row functions are verbatim conversions from Visual C. @@ -2297,6 +2385,15 @@ void ScalePlaneBilinear(int src_width, int src_height, } } #endif + + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleFilterCols_C; +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + int dx = 0; int dy = 0; int x = 0; @@ -2327,11 +2424,11 @@ void ScalePlaneBilinear(int src_width, int src_height, int yi = y >> 16; const uint8* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { - ScaleFilterCols_C(dst_ptr, src, dst_width, x, dx); + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; InterpolateRow(row, src, src_stride, src_width, yf); - ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); } dst_ptr += dst_stride; y += dy; diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 2fa904bf8..807907759 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -179,9 +179,9 @@ TEST_FACTOR(Vertical2by3, 1, 2 / 3) // Test scale to a specified size with all 4 filters. 
#define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 0) \ - TEST_SCALETO1(name, width, height, Bilinear, 2) \ - TEST_SCALETO1(name, width, height, Box, 2) + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) \ + TEST_SCALETO1(name, width, height, Box, 3) TEST_SCALETO(Scale, 640, 360) TEST_SCALETO(Scale, 853, 480)