From 98a1fbf5e9797112515d591b1262db6ae049b8fa Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Sun, 7 Apr 2013 04:07:08 +0000 Subject: [PATCH] Scale up columns 2 pixels at a time BUG=208 TEST=out\release\libyuv_unittest --gtest_filter=*Scale*640* Review URL: https://webrtc-codereview.appspot.com/1294004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@648 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_win.cc | 44 +++++++++--------- source/scale_argb.cc | 90 +++++++++++++++++++++++++----------- unit_test/scale_argb_test.cc | 4 +- 5 files changed, 90 insertions(+), 52 deletions(-) diff --git a/README.chromium b/README.chromium index c379f7300..d072a23bf 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 646 +Version: 648 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 738ec4f97..f2197fce5 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 646 +#define LIBYUV_VERSION 648 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_win.cc b/source/row_win.cc index 7322d977e..3ec2ed472 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -3043,12 +3043,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, pxor xmm5, xmm5 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 - mov eax,0x00100010 - movd xmm3,eax - pshufd xmm3,xmm3,0 - mov eax,0x004a004a // 74 - movd xmm2,eax - pshufd xmm2,xmm2,0 + mov eax, 0x00100010 + movd xmm3, eax + pshufd xmm3, xmm3, 0 + mov eax, 0x004a004a // 74 + movd xmm2, eax + pshufd xmm2, xmm2,0 mov eax, [esp + 4] // Y mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width @@ -4267,8 +4267,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, pxor xmm3, xmm4 // ~alpha movd 
xmm2, [esi] // _r_b psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha @@ -4298,8 +4298,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, pxor xmm3, xmm4 // ~alpha movdqu xmm2, [esi] // _r_b psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha @@ -4329,8 +4329,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, pxor xmm3, xmm4 // ~alpha movd xmm2, [esi] // _r_b psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h + pshufhw xmm3, xmm3, 0F5h // 8 alpha words + pshuflw xmm3, xmm3, 0F5h pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha @@ -4363,8 +4363,8 @@ static const uvec8 kShuffleAlpha = { }; // Same as SSE2, but replaces: // psrlw xmm3, 8 // alpha -// pshufhw xmm3, xmm3,0F5h // 8 alpha words -// pshuflw xmm3, xmm3,0F5h +// pshufhw xmm3, xmm3, 0F5h // 8 alpha words +// pshuflw xmm3, xmm3, 0F5h // with.. // pshufb xmm3, kShuffleAlpha // alpha // Blend 8 pixels at a time. 
@@ -4533,13 +4533,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { convertloop: movdqa xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm0 // first 2 - pshufhw xmm2, xmm0,0FFh // 8 alpha words - pshuflw xmm2, xmm2,0FFh + pshufhw xmm2, xmm0, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh pmulhuw xmm0, xmm2 // rgb * a movdqa xmm1, [eax] // read 4 pixels punpckhbw xmm1, xmm1 // next 2 pixels - pshufhw xmm2, xmm1,0FFh // 8 alpha words - pshuflw xmm2, xmm2,0FFh + pshufhw xmm2, xmm1, 0FFh // 8 alpha words + pshuflw xmm2, xmm2, 0FFh pmulhuw xmm1, xmm2 // rgb * a movdqa xmm2, [eax] // alphas psrlw xmm0, 8 @@ -4673,8 +4673,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, punpcklbw xmm0, xmm0 // first 2 movd xmm2, dword ptr fixed_invtbl8[esi * 4] movd xmm3, dword ptr fixed_invtbl8[edi * 4] - pshuflw xmm2, xmm2,040h // first 4 inv_alpha words. 1, a, a, a - pshuflw xmm3, xmm3,040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 pmulhuw xmm0, xmm2 // rgb * a @@ -4684,8 +4684,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, punpckhbw xmm1, xmm1 // next 2 movd xmm2, dword ptr fixed_invtbl8[esi * 4] movd xmm3, dword ptr fixed_invtbl8[edi * 4] - pshuflw xmm2, xmm2,040h // first 4 inv_alpha words - pshuflw xmm3, xmm3,040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 pmulhuw xmm1, xmm2 // rgb * a diff --git a/source/scale_argb.cc b/source/scale_argb.cc index f55ea0ecb..fd5b07a7f 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -424,46 +424,86 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_argb, const uint8* src_argb, // Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
// TODO(fbarchard): Port to Neon
 // TODO(fbarchard): Port to Posix
-// TODO(fbarchard): Unroll for 2 pixels for better pairing and memory access.
+// TODO(fbarchard): Consider lea to get 2nd pixel without incrementing.
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static const uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u,
+};
+
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                                       int dst_width, int x, int dx) {
   __asm {
     push       ebx
+    push       ebp                  // saved: ebp holds x1 (x + dx) in the loop
     push       esi
     push       edi
-    mov        edi, [esp + 12 + 4]   // dst_argb
-    mov        esi, [esp + 12 + 8]   // src_argb
-    mov        ecx, [esp + 12 + 12]  // dst_width
-    mov        edx, [esp + 12 + 16]  // x
-    mov        ebx, [esp + 12 + 20]  // dx
+    mov        edi, [esp + 16 + 4]   // dst_argb
+    mov        esi, [esp + 16 + 8]   // src_argb
+    mov        ecx, [esp + 16 + 12]  // dst_width
+    mov        edx, [esp + 16 + 16]  // x
+    mov        ebx, [esp + 16 + 20]  // dx
+    movdqa     xmm3, kShuffleFractions  // pshufb mask: repeat each fraction 8x
+    movdqa     xmm4, kShuffleColARGB  // pshufb mask: pair up channel bytes
     pcmpeqb    xmm5, xmm5           // generate 0x007f for inverting fraction.
     psrlw      xmm5, 9
+    sub        ecx, 2               // ecx = dst_width - 2
+    jl         xloop29              // fewer than 2 pixels: skip 2 pixel loop
 
     align      16
-  xloop:
-    mov        eax, edx             // get x integer offset
-    shr        eax, 16
-    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source pixels
-    pshufd     xmm1, xmm0, 1        // second pixel
-    punpcklbw  xmm0, xmm1           // aarrggbb
-    movd       xmm2, edx            // get x fraction
-    psrlw      xmm2, 9              // 7 bit fraction
-    punpcklbw  xmm2, xmm2
-    punpcklwd  xmm2, xmm2
-    pshufd     xmm2, xmm2, 0
-    pxor       xmm2, xmm5           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2
+  xloop2:                           // 2 destination pixels per iteration
+    mov        eax, edx             // get x0 integer
+    movd       xmm1, edx            // get x0 fraction
+    lea        ebp, [edx + ebx]     // get x1 integer (x + dx)
+    movd       xmm2, ebp            // get x1 fraction
+    shr        eax, 16              // x0
+    punpcklwd  xmm1, xmm2           // x0x1 fractions
+    lea        edx, [edx + ebx * 2]  // x += dx * 2
+    shr        ebp, 16              // x1
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    movhps     xmm0, qword ptr [esi + ebp * 4]  // 2 source x1 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    pshufb     xmm1, xmm3           // 0000000011111111
+    sub        ecx, 2               // 2 pixels consumed; jge below tests this
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm5           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0
-    add        edx, ebx             // x += dx
-    sub        ecx, 1
+    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    movq       qword ptr [edi], xmm0  // store 2 pixels (8 bytes)
+    lea        edi, [edi + 8]       // dst_argb += 8
+    jge        xloop2               // loop while at least 2 pixels remain
+  xloop29:                          // here ecx = remaining pixels - 2
+
+    add        ecx, 2 - 1           // ecx = remaining pixels - 1
+    jl         xloop99              // no pixel left: done
+
+    // 1 pixel remainder
+    mov        eax, edx             // get x0 integer
+    movd       xmm1, edx            // get x0 fraction
+    shr        eax, 16              // x0
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    pshufb     xmm1, xmm3           // 00000000
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm5           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb 16 bit, 1 pixel.
+    psrlw      xmm0, 7              // drop the 7 fraction bits
+    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
     movd       [edi], xmm0
-    lea        edi, [edi + 4]
-    jg         xloop
+  xloop99:
+
     pop        edi
     pop        esi
+    pop        ebp                  // restore ebp pushed on entry
     pop        ebx
     ret
   }
@@ -1104,8 +1144,6 @@ static void ScaleARGBBilinear(int src_width, int src_height,
     ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
   }
 #endif
-
-
   int dx = (src_width << 16) / dst_width;
   int dy = (src_height << 16) / dst_height;
   int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 8783f2a6f..7918d5bf7 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -410,7 +410,7 @@ TEST_F(libyuvTest, ARGBScaleTo853x480_Bilinear) {
                                  dst_width, dst_height,
                                  kFilterBilinear,
                                  benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 3);
 }
 
 TEST_F(libyuvTest, ARGBScaleFrom640x360_None) {
@@ -436,7 +436,7 @@ TEST_F(libyuvTest, ARGBScaleFrom640x360_Bilinear) {
                                  dst_width, dst_height,
                                  kFilterBilinear,
                                  benchmark_iterations_);
-  EXPECT_LE(max_diff, 2);
+  EXPECT_LE(max_diff, 3);
 }
 
 }  // namespace libyuv