mirror of https://chromium.googlesource.com/libyuv/libyuv
change scale down by 4 to use rounding.
TBR=harryjin@google.com
BUG=libyuv:447
Review URL: https://codereview.chromium.org/1525033005 .
parent be984a8b9a
commit 80ca4514ef
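The change replaces the old pavgb-based box filter, which rounds at every pairwise-average step, with a single rounded sum: the 16 source pixels of each 4x4 block are added, 8 is added for rounding, and the total is shifted right by 4. A minimal scalar C sketch of that arithmetic (an illustration only; the function name is made up and this is not the committed SIMD code):

#include <stddef.h>
#include <stdint.h>

// Rounded 4x4 box average: sum the 16 source pixels of each block,
// add 8 to round, then divide by 16.  Scalar illustration only; the
// commit implements the same arithmetic with pmaddubsw/paddw/psrlw.
static void ScaleRowDown4Box_Rounded_Sketch(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + x * 4 + c];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // +8 to round, /16 for 4x4 pixels
  }
}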
@@ -59,7 +59,7 @@ extern "C" {
 #define HAS_SCALEROWDOWN2_SSSE3
 #define HAS_SCALEROWDOWN34_SSSE3
 #define HAS_SCALEROWDOWN38_SSSE3
-#define HAS_SCALEROWDOWN4_SSE2
+#define HAS_SCALEROWDOWN4_SSSE3
 #define HAS_SCALEADDROW_SSE2
 #endif
 
@@ -244,9 +244,9 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
@@ -281,9 +281,9 @@ void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
 void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width);
@@ -47,7 +47,7 @@
   # Change type to 'shared_library' to build .so or .dll files.
   'type': 'static_library',
   'variables': {
-    'optimize': 'max', # enable O2 and ltcg.
+    # 'optimize': 'max', # enable O2 and ltcg.
   },
   # Allows libyuv.a redistributable library without external dependencies.
   'standalone_static_library': 1,
@@ -182,12 +182,12 @@ static void ScalePlaneDown4(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN4_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
+#if defined(HAS_SCALEROWDOWN4_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
     ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2;
+        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 8)) {
-      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
    }
  }
 #endif
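For context, the _Any_ functions selected above handle arbitrary widths by running the SIMD body on the multiple-of-8 prefix and finishing the tail in C, which is why the unconditional assignment uses the _Any_ variant and the plain SIMD variant is only chosen when IS_ALIGNED(dst_width, 8). A hedged sketch of that idea (an illustration, not libyuv's actual SDANY macro expansion; libyuv's uint8 is a uint8_t):

#include <stddef.h>
#include <stdint.h>

// Row functions as declared in the patched header.
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride,
                            uint8_t* dst_ptr, int dst_width);
void ScaleRowDown4Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride,
                        uint8_t* dst_ptr, int dst_width);

// Illustrative "Any" wrapper: SIMD on the aligned prefix, C on the leftovers.
static void ScaleRowDown4Box_Any_Sketch(const uint8_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst_ptr, int dst_width) {
  int n = dst_width & ~7;  // widest prefix that is a multiple of 8
  if (n > 0) {
    ScaleRowDown4Box_SSSE3(src_ptr, src_stride, dst_ptr, n);
  }
  ScaleRowDown4Box_C(src_ptr + n * 4, src_stride,  // 4 source pixels per output
                     dst_ptr + n, dst_width - n);
}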
@@ -76,9 +76,9 @@ SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
 SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
       ScaleRowDown2Box_C, 2, 1, 15)
 #endif
-#ifdef HAS_SCALEROWDOWN4_SSE2
-SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C,
+#ifdef HAS_SCALEROWDOWN4_SSSE3
+SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
       4, 1, 7)
 #endif
 #ifdef HAS_SCALEROWDOWN4_AVX2
@@ -286,7 +286,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 }
 #endif // HAS_SCALEROWDOWN2_AVX2
 
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width) {
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
@@ -314,12 +314,15 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width) {
   intptr_t stridex3 = 0;
   asm volatile (
-    "pcmpeqb %%xmm7,%%xmm7 \n"
-    "psrlw $0x8,%%xmm7 \n"
+    "pcmpeqb %%xmm4,%%xmm4 \n"
+    "psrlw $0xf,%%xmm4 \n"
+    "movdqa %%xmm4,%%xmm5 \n"
+    "packuswb %%xmm4,%%xmm4 \n"
+    "psllw $0x3,%%xmm5 \n"
     "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
 
   LABELALIGN
@@ -328,30 +331,28 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
     MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
     MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
-    "pavgb %%xmm2,%%xmm0 \n"
-    "pavgb %%xmm3,%%xmm1 \n"
+    "pmaddubsw %%xmm4,%%xmm0 \n"
+    "pmaddubsw %%xmm4,%%xmm1 \n"
+    "pmaddubsw %%xmm4,%%xmm2 \n"
+    "pmaddubsw %%xmm4,%%xmm3 \n"
+    "paddw %%xmm2,%%xmm0 \n"
+    "paddw %%xmm3,%%xmm1 \n"
     MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
     MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
-    MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
-    MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
+    "pmaddubsw %%xmm4,%%xmm2 \n"
+    "pmaddubsw %%xmm4,%%xmm3 \n"
+    "paddw %%xmm2,%%xmm0 \n"
+    "paddw %%xmm3,%%xmm1 \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
     "lea " MEMLEA(0x20,0) ",%0 \n"
-    "pavgb %%xmm4,%%xmm2 \n"
-    "pavgb %%xmm2,%%xmm0 \n"
-    "pavgb %%xmm5,%%xmm3 \n"
-    "pavgb %%xmm3,%%xmm1 \n"
-    "movdqa %%xmm0,%%xmm2 \n"
-    "psrlw $0x8,%%xmm0 \n"
-    "movdqa %%xmm1,%%xmm3 \n"
-    "psrlw $0x8,%%xmm1 \n"
-    "pand %%xmm7,%%xmm2 \n"
-    "pand %%xmm7,%%xmm3 \n"
-    "pavgw %%xmm2,%%xmm0 \n"
-    "pavgw %%xmm3,%%xmm1 \n"
-    "packuswb %%xmm1,%%xmm0 \n"
-    "movdqa %%xmm0,%%xmm2 \n"
-    "psrlw $0x8,%%xmm0 \n"
-    "pand %%xmm7,%%xmm2 \n"
-    "pavgw %%xmm2,%%xmm0 \n"
+    "pmaddubsw %%xmm4,%%xmm2 \n"
+    "pmaddubsw %%xmm4,%%xmm3 \n"
+    "paddw %%xmm2,%%xmm0 \n"
+    "paddw %%xmm3,%%xmm1 \n"
+    "phaddw %%xmm1,%%xmm0 \n"
+    "paddw %%xmm5,%%xmm0 \n"
+    "psrlw $0x4,%%xmm0 \n"
     "packuswb %%xmm0,%%xmm0 \n"
     "movq %%xmm0," MEMACCESS(1) " \n"
     "lea " MEMLEA(0x8,1) ",%1 \n"
@@ -363,7 +364,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     "+r"(stridex3) // %3
   : "r"((intptr_t)(src_stride)) // %4
   : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
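In both GCC inline-assembly functions above (and in the Windows versions below) the trick is the same: pmaddubsw against a vector of 0x01 bytes adds adjacent unsigned bytes into 16-bit words, paddw accumulates the four rows, phaddw folds the remaining horizontal pairs, and the +8 / >>4 step produces the rounded average of each 4x4 block. A hedged intrinsics rendering of that arithmetic, not the committed code (the helper name and one-register-per-row layout are illustrative):

#include <tmmintrin.h>  // SSSE3: _mm_maddubs_epi16, _mm_hadd_epi16

// r0..r3 hold 16 consecutive source bytes from four adjacent rows.
// Returns 4 rounded 4x4 box averages in the low 4 bytes of the result.
static inline __m128i Round4x4BoxSketch(__m128i r0, __m128i r1,
                                        __m128i r2, __m128i r3) {
  const __m128i kOnes = _mm_set1_epi8(1);    // pmaddubsw weights (0x0101...)
  const __m128i kRound = _mm_set1_epi16(8);  // +8 before the >>4
  __m128i s01 = _mm_add_epi16(_mm_maddubs_epi16(r0, kOnes),
                              _mm_maddubs_epi16(r1, kOnes));
  __m128i s23 = _mm_add_epi16(_mm_maddubs_epi16(r2, kOnes),
                              _mm_maddubs_epi16(r3, kOnes));
  __m128i sum = _mm_add_epi16(s01, s23);   // eight 4x2 sub-block sums
  sum = _mm_hadd_epi16(sum, sum);          // fold pairs: four 4x4 sums
  sum = _mm_srli_epi16(_mm_add_epi16(sum, kRound), 4);  // (sum + 8) / 16
  return _mm_packus_epi16(sum, sum);       // back to unsigned bytes
}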
@@ -309,7 +309,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
 
 // Point samples 32 pixels to 8 pixels.
 __declspec(naked)
-void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width) {
   __asm {
     mov eax, [esp + 4] // src_ptr
@@ -340,7 +340,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 
 // Blends 32x4 rectangle to 8x1.
 __declspec(naked)
-void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     uint8* dst_ptr, int dst_width) {
   __asm {
     push esi
@@ -350,42 +350,40 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     mov edx, [esp + 8 + 12] // dst_ptr
     mov ecx, [esp + 8 + 16] // dst_width
     lea edi, [esi + esi * 2] // src_stride * 3
-    pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
-    psrlw xmm7, 8
+    pcmpeqb xmm4, xmm4 // constant 0x0101
+    psrlw xmm4, 15
+    movdqa xmm5, xmm4
+    packuswb xmm4, xmm4
+    psllw xmm5, 3 // constant 0x0008
 
   wloop:
     movdqu xmm0, [eax] // average rows
     movdqu xmm1, [eax + 16]
     movdqu xmm2, [eax + esi]
     movdqu xmm3, [eax + esi + 16]
-    pavgb xmm0, xmm2
-    pavgb xmm1, xmm3
+    pmaddubsw xmm0, xmm4 // horizontal add
+    pmaddubsw xmm1, xmm4
+    pmaddubsw xmm2, xmm4
+    pmaddubsw xmm3, xmm4
+    paddw xmm0, xmm2 // vertical add rows 0, 1
+    paddw xmm1, xmm3
     movdqu xmm2, [eax + esi * 2]
     movdqu xmm3, [eax + esi * 2 + 16]
-    movdqu xmm4, [eax + edi]
-    movdqu xmm5, [eax + edi + 16]
+    pmaddubsw xmm2, xmm4
+    pmaddubsw xmm3, xmm4
+    paddw xmm0, xmm2 // add row 2
+    paddw xmm1, xmm3
+    movdqu xmm2, [eax + edi]
+    movdqu xmm3, [eax + edi + 16]
     lea eax, [eax + 32]
-    pavgb xmm2, xmm4
-    pavgb xmm3, xmm5
-    pavgb xmm0, xmm2
-    pavgb xmm1, xmm3
-    movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
-    psrlw xmm0, 8
-    movdqa xmm3, xmm1
-    psrlw xmm1, 8
-    pand xmm2, xmm7
-    pand xmm3, xmm7
-    pavgw xmm0, xmm2
-    pavgw xmm1, xmm3
-    packuswb xmm0, xmm1
+    pmaddubsw xmm2, xmm4
+    pmaddubsw xmm3, xmm4
+    paddw xmm0, xmm2 // add row 3
+    paddw xmm1, xmm3
+    phaddw xmm0, xmm1
+    paddw xmm0, xmm5 // + 8 for round
+    psrlw xmm0, 4 // /16 for average of 4 * 4
 
-    movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
-    psrlw xmm0, 8
-    pand xmm2, xmm7
-    pavgw xmm0, xmm2
     packuswb xmm0, xmm0
 
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
     sub ecx, 8
@@ -444,37 +442,41 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     mov edx, [esp + 8 + 12] // dst_ptr
     mov ecx, [esp + 8 + 16] // dst_width
     lea edi, [esi + esi * 2] // src_stride * 3
-    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff
-    vpsrlw ymm7, ymm7, 8
+    vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101
+    vpsrlw ymm4, ymm4, 15
+    vpsllw ymm5, ymm4, 3 // constant 0x0008
+    vpackuswb ymm4, ymm4, ymm4
 
  wloop:
     vmovdqu ymm0, [eax] // average rows
     vmovdqu ymm1, [eax + 32]
-    vpavgb ymm0, ymm0, [eax + esi]
-    vpavgb ymm1, ymm1, [eax + esi + 32]
+    vmovdqu ymm2, [eax + esi]
+    vmovdqu ymm3, [eax + esi + 32]
+    vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1
+    vpaddw ymm1, ymm1, ymm3
     vmovdqu ymm2, [eax + esi * 2]
     vmovdqu ymm3, [eax + esi * 2 + 32]
-    vpavgb ymm2, ymm2, [eax + edi]
-    vpavgb ymm3, ymm3, [eax + edi + 32]
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw ymm0, ymm0, ymm2 // add row 2
+    vpaddw ymm1, ymm1, ymm3
+    vmovdqu ymm2, [eax + edi]
+    vmovdqu ymm3, [eax + edi + 32]
     lea eax, [eax + 64]
-    vpavgb ymm0, ymm0, ymm2
-    vpavgb ymm1, ymm1, ymm3
-    vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels)
-    vpand ymm3, ymm1, ymm7
-    vpsrlw ymm0, ymm0, 8
-    vpsrlw ymm1, ymm1, 8
-    vpavgw ymm0, ymm0, ymm2
-    vpavgw ymm1, ymm1, ymm3
-    vpackuswb ymm0, ymm0, ymm1
-    vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw ymm0, ymm0, ymm2 // add row 3
+    vpaddw ymm1, ymm1, ymm3
+    vphaddw ymm0, ymm0, ymm1 // mutates
+    vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw
+    vpaddw ymm0, ymm0, ymm5 // + 8 for round
+    vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4
 
-    vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels)
-    vpsrlw ymm0, ymm0, 8
-    vpavgw ymm0, ymm0, ymm2
     vpackuswb ymm0, ymm0, ymm0
     vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb
 
     vmovdqu [edx], xmm0
     lea edx, [edx + 16]
     sub ecx, 16
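A note on the two vpermq instructions in the AVX2 version above: vphaddw and vpackuswb each operate within 128-bit lanes, so their results come out with the two source registers' halves interleaved by 64-bit quarters, and vpermq with immediate 0xd8 (qword order 0, 2, 1, 3) swaps the middle quarters to restore source order. A tiny standalone illustration of that permutation (the a.lo/b.lo labels are assumed, not library code):

#include <stdio.h>

// 0xd8 = 0b11011000: output qword i takes input qword (0xd8 >> (2*i)) & 3,
// i.e. the order 0, 2, 1, 3 -- the two middle 64-bit quarters are swapped.
int main(void) {
  const char* qword[4] = {"a.lo", "b.lo", "a.hi", "b.hi"};  // after vphaddw
  for (int i = 0; i < 4; ++i) {
    printf("out[%d] = %s\n", i, qword[(0xd8 >> (2 * i)) & 3]);
  }
  return 0;  // prints a.lo, a.hi, b.lo, b.hi: back in source order
}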
@@ -298,17 +298,17 @@ static int TestFilter_16(int src_width, int src_height,
 
 // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
 // filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, nom, denom, maxdiff) \
+#define TEST_FACTOR(name, nom, denom, boxdiff) \
   TEST_FACTOR1(name, None, nom, denom, 0) \
-  TEST_FACTOR1(name, Linear, nom, denom, maxdiff) \
-  TEST_FACTOR1(name, Bilinear, nom, denom, maxdiff) \
-  TEST_FACTOR1(name, Box, nom, denom, maxdiff)
+  TEST_FACTOR1(name, Linear, nom, denom, 3) \
+  TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
+  TEST_FACTOR1(name, Box, nom, denom, boxdiff)
 
 TEST_FACTOR(2, 1, 2, 0)
-TEST_FACTOR(4, 1, 4, 3)
+TEST_FACTOR(4, 1, 4, 0)
 TEST_FACTOR(8, 1, 8, 3)
-TEST_FACTOR(3by4, 3, 4, 3)
-TEST_FACTOR(3by8, 3, 8, 3)
+TEST_FACTOR(3by4, 3, 4, 1)
+TEST_FACTOR(3by8, 3, 8, 1)
 TEST_FACTOR(3, 1, 3, 3)
 #undef TEST_FACTOR1
 #undef TEST_FACTOR