mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
Scale up columns 2 pixels at a time
BUG=208 TEST=out\release\libyuv_unittest --gtest_filter=*Scale*640* Review URL: https://webrtc-codereview.appspot.com/1294004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@648 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
a007046105
commit
98a1fbf5e9
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 646
|
||||
Version: 648
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 646
|
||||
#define LIBYUV_VERSION 648
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -3043,12 +3043,12 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
||||
pxor xmm5, xmm5
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||
pslld xmm4, 24
|
||||
mov eax,0x00100010
|
||||
movd xmm3,eax
|
||||
pshufd xmm3,xmm3,0
|
||||
mov eax,0x004a004a // 74
|
||||
movd xmm2,eax
|
||||
pshufd xmm2,xmm2,0
|
||||
mov eax, 0x00100010
|
||||
movd xmm3, eax
|
||||
pshufd xmm3, xmm3, 0
|
||||
mov eax, 0x004a004a // 74
|
||||
movd xmm2, eax
|
||||
pshufd xmm2, xmm2,0
|
||||
mov eax, [esp + 4] // Y
|
||||
mov edx, [esp + 8] // rgb
|
||||
mov ecx, [esp + 12] // width
|
||||
@ -4267,8 +4267,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movd xmm2, [esi] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pshufhw xmm3, xmm3, 0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3, 0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
@ -4298,8 +4298,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movdqu xmm2, [esi] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pshufhw xmm3, xmm3, 0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3, 0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
@ -4329,8 +4329,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movd xmm2, [esi] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pshufhw xmm3, xmm3, 0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3, 0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
@ -4363,8 +4363,8 @@ static const uvec8 kShuffleAlpha = {
|
||||
};
|
||||
// Same as SSE2, but replaces:
|
||||
// psrlw xmm3, 8 // alpha
|
||||
// pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
// pshuflw xmm3, xmm3,0F5h
|
||||
// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
|
||||
// pshuflw xmm3, xmm3, 0F5h
|
||||
// with..
|
||||
// pshufb xmm3, kShuffleAlpha // alpha
|
||||
// Blend 8 pixels at a time.
|
||||
@ -4533,13 +4533,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
convertloop:
|
||||
movdqa xmm0, [eax] // read 4 pixels
|
||||
punpcklbw xmm0, xmm0 // first 2
|
||||
pshufhw xmm2, xmm0,0FFh // 8 alpha words
|
||||
pshuflw xmm2, xmm2,0FFh
|
||||
pshufhw xmm2, xmm0, 0FFh // 8 alpha words
|
||||
pshuflw xmm2, xmm2, 0FFh
|
||||
pmulhuw xmm0, xmm2 // rgb * a
|
||||
movdqa xmm1, [eax] // read 4 pixels
|
||||
punpckhbw xmm1, xmm1 // next 2 pixels
|
||||
pshufhw xmm2, xmm1,0FFh // 8 alpha words
|
||||
pshuflw xmm2, xmm2,0FFh
|
||||
pshufhw xmm2, xmm1, 0FFh // 8 alpha words
|
||||
pshuflw xmm2, xmm2, 0FFh
|
||||
pmulhuw xmm1, xmm2 // rgb * a
|
||||
movdqa xmm2, [eax] // alphas
|
||||
psrlw xmm0, 8
|
||||
@ -4673,8 +4673,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
punpcklbw xmm0, xmm0 // first 2
|
||||
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
|
||||
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
|
||||
pshuflw xmm2, xmm2,040h // first 4 inv_alpha words. 1, a, a, a
|
||||
pshuflw xmm3, xmm3,040h // next 4 inv_alpha words
|
||||
pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
|
||||
pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
|
||||
movlhps xmm2, xmm3
|
||||
pmulhuw xmm0, xmm2 // rgb * a
|
||||
|
||||
@ -4684,8 +4684,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
punpckhbw xmm1, xmm1 // next 2
|
||||
movd xmm2, dword ptr fixed_invtbl8[esi * 4]
|
||||
movd xmm3, dword ptr fixed_invtbl8[edi * 4]
|
||||
pshuflw xmm2, xmm2,040h // first 4 inv_alpha words
|
||||
pshuflw xmm3, xmm3,040h // next 4 inv_alpha words
|
||||
pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
|
||||
pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
|
||||
movlhps xmm2, xmm3
|
||||
pmulhuw xmm1, xmm2 // rgb * a
|
||||
|
||||
|
||||
@ -424,46 +424,86 @@ void ScaleARGBFilterRows_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
||||
// Bilinear column filtering combines 2x1 -> 1x1. SSSE3 version.
|
||||
// TODO(fbarchard): Port to Neon
|
||||
// TODO(fbarchard): Port to Posix
|
||||
// TODO(fbarchard): Unroll for 2 pixels for better pairing and memory access.
|
||||
// TODO(fbarchard): Consider lea to get 2nd pixel without incrementing.
|
||||
|
||||
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw.
// Used as the pshufb control in ScaleARGBFilterCols_SSSE3. Each 8 byte half of
// the source register holds two adjacent 4 byte source pixels; the shuffle
// interleaves them channel by channel so every byte pair is
// (left pixel channel, right pixel channel), ready to be weighted and summed
// by pmaddubsw.
|
||||
static const uvec8 kShuffleColARGB = {
|
||||
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa pairs for 1st output pixel
|
||||
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa pairs for 2nd output pixel
|
||||
};
|
||||
|
||||
// Shuffle table for duplicating 2 fractions into 8 bytes each.
// Used as the pshufb control in ScaleARGBFilterCols_SSSE3: byte 0 (low byte of
// the first 16 bit lane, the 7 bit x0 fraction after punpcklwd / psrlw 9) is
// copied to bytes 0..7, and byte 2 (low byte of the second lane, the x1
// fraction) is copied to bytes 8..15.
|
||||
static const uvec8 kShuffleFractions = {
|
||||
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 2u, 2u, 2u, 2u, 2u, 2u, 2u, 2u,
|
||||
};
|
||||
|
||||
#define HAS_SCALEARGBFILTERCOLS_SSSE3
|
||||
__declspec(naked) __declspec(align(16))
|
||||
static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) {
|
||||
__asm {
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
mov edi, [esp + 12 + 4] // dst_argb
|
||||
mov esi, [esp + 12 + 8] // src_argb
|
||||
mov ecx, [esp + 12 + 12] // dst_width
|
||||
mov edx, [esp + 12 + 16] // x
|
||||
mov ebx, [esp + 12 + 20] // dx
|
||||
mov edi, [esp + 16 + 4] // dst_argb
|
||||
mov esi, [esp + 16 + 8] // src_argb
|
||||
mov ecx, [esp + 16 + 12] // dst_width
|
||||
mov edx, [esp + 16 + 16] // x
|
||||
mov ebx, [esp + 16 + 20] // dx
|
||||
movdqa xmm3, kShuffleFractions
|
||||
movdqa xmm4, kShuffleColARGB
|
||||
pcmpeqb xmm5, xmm5 // generate 0x007f for inverting fraction.
|
||||
psrlw xmm5, 9
|
||||
sub ecx, 2
|
||||
jl xloop29
|
||||
|
||||
align 16
|
||||
xloop:
|
||||
mov eax, edx // get x integer offset
|
||||
shr eax, 16
|
||||
movq xmm0, qword ptr [esi + eax * 4] // 2 source pixels
|
||||
pshufd xmm1, xmm0, 1 // second pixel
|
||||
punpcklbw xmm0, xmm1 // aarrggbb
|
||||
movd xmm2, edx // get x fraction
|
||||
psrlw xmm2, 9 // 7 bit fraction
|
||||
punpcklbw xmm2, xmm2
|
||||
punpcklwd xmm2, xmm2
|
||||
pshufd xmm2, xmm2, 0
|
||||
pxor xmm2, xmm5 // 0..7f and 7f..0
|
||||
pmaddubsw xmm0, xmm2
|
||||
xloop2:
|
||||
mov eax, edx // get x0 integer
|
||||
movd xmm1, edx // get x0 fraction
|
||||
lea ebp, [edx + ebx] // get x1 integer (x + dx)
|
||||
movd xmm2, ebp // get x1 fraction
|
||||
shr eax, 16 // x0
|
||||
punpcklwd xmm1, xmm2 // x0x1 fractions
|
||||
lea edx, [edx + ebx * 2] // x += dx * 2
|
||||
shr ebp, 16 // x1
|
||||
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
|
||||
movhps xmm0, qword ptr [esi + ebp * 4] // 2 source x1 pixels
|
||||
psrlw xmm1, 9 // 7 bit fractions.
|
||||
pshufb xmm1, xmm3 // 0000000011111111
|
||||
sub ecx, 2
|
||||
pshufb xmm0, xmm4 // arrange pixels into pairs
|
||||
pxor xmm1, xmm5 // 0..7f and 7f..0
|
||||
pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
|
||||
psrlw xmm0, 7
|
||||
packuswb xmm0, xmm0
|
||||
add edx, ebx // x += dx
|
||||
sub ecx, 1
|
||||
packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
|
||||
movq qword ptr [edi], xmm0
|
||||
lea edi, [edi + 8]
|
||||
jge xloop2
|
||||
xloop29:
|
||||
|
||||
add ecx, 2 - 1
|
||||
jl xloop99
|
||||
|
||||
// 1 pixel remainder
|
||||
mov eax, edx // get x0 integer
|
||||
movd xmm1, edx // get x0 fraction
|
||||
shr eax, 16 // x0
|
||||
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
|
||||
psrlw xmm1, 9 // 7 bit fractions.
|
||||
pshufb xmm1, xmm3 // 00000000
|
||||
pshufb xmm0, xmm4 // arrange pixels into pairs
|
||||
pxor xmm1, xmm5 // 0..7f and 7f..0
|
||||
pmaddubsw xmm0, xmm1 // argb 16 bit, 1 pixel.
|
||||
psrlw xmm0, 7
|
||||
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
|
||||
movd [edi], xmm0
|
||||
lea edi, [edi + 4]
|
||||
jg xloop
|
||||
xloop99:
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebp
|
||||
pop ebx
|
||||
ret
|
||||
}
|
||||
@ -1104,8 +1144,6 @@ static void ScaleARGBBilinear(int src_width, int src_height,
|
||||
ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
int dx = (src_width << 16) / dst_width;
|
||||
int dy = (src_height << 16) / dst_height;
|
||||
int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
|
||||
|
||||
@ -410,7 +410,7 @@ TEST_F(libyuvTest, ARGBScaleTo853x480_Bilinear) {
|
||||
dst_width, dst_height,
|
||||
kFilterBilinear,
|
||||
benchmark_iterations_);
|
||||
EXPECT_LE(max_diff, 1);
|
||||
EXPECT_LE(max_diff, 3);
|
||||
}
|
||||
|
||||
TEST_F(libyuvTest, ARGBScaleFrom640x360_None) {
|
||||
@ -436,7 +436,7 @@ TEST_F(libyuvTest, ARGBScaleFrom640x360_Bilinear) {
|
||||
dst_width, dst_height,
|
||||
kFilterBilinear,
|
||||
benchmark_iterations_);
|
||||
EXPECT_LE(max_diff, 2);
|
||||
EXPECT_LE(max_diff, 3);
|
||||
}
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user