mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-08 18:56:43 +08:00
use rounding in scaledown by 2
When scaling down by 2, the formula should round consistently: (a + b + c + d + 2) / 4. The C version did, but the SSE2 version was doing two averages, avg(avg(a, b), avg(c, d)), which can come out high by 1. This change uses a sum, then rounds.
R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:447,libyuv:527
Review URL: https://codereview.chromium.org/1513183004 .
This commit is contained in:
parent 71b60123dc
commit ae55e41851
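
The inconsistency described in the commit message is easy to reproduce in
scalar code. Below is a minimal sketch (plain C, not libyuv code); avg models
the round-up semantics of the x86 pavgb/pavgw instructions,
avg(x, y) = (x + y + 1) >> 1:

#include <stdio.h>

static unsigned avg(unsigned x, unsigned y) {
  return (x + y + 1) >> 1;  // pavgb/pavgw: round half up
}

int main(void) {
  unsigned worst = 0;
  for (unsigned a = 0; a < 32; ++a)
    for (unsigned b = 0; b < 32; ++b)
      for (unsigned c = 0; c < 32; ++c)
        for (unsigned d = 0; d < 32; ++d) {
          unsigned rounded = (a + b + c + d + 2) / 4;   // C reference
          unsigned twoavg = avg(avg(a, b), avg(c, d));  // old SSE2 result
          unsigned diff = twoavg > rounded ? twoavg - rounded
                                           : rounded - twoavg;
          if (diff > worst) {
            worst = diff;
            printf("a=%u b=%u c=%u d=%u: avg-of-avg=%u rounded=%u\n",
                   a, b, c, d, twoavg, rounded);
          }
        }
  printf("max deviation: %u\n", worst);  // 1, first hit at a=0 b=0 c=0 d=1
  return 0;
}

The avg-of-averages result never differs from the rounded sum by more than 1,
but the error is an upward bias; the sum-then-round kernels in this commit
remove it.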
README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1553
+Version: 1554
 License: BSD
 License File: LICENSE

include/libyuv/scale_row.h
@@ -56,7 +56,7 @@ extern "C" {
 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
 #define HAS_SCALECOLSUP2_SSE2
 #define HAS_SCALEFILTERCOLS_SSSE3
-#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN2_SSSE3
 #define HAS_SCALEROWDOWN34_SSSE3
 #define HAS_SCALEROWDOWN38_SSSE3
 #define HAS_SCALEROWDOWN4_SSE2
@@ -232,12 +232,12 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
                              int dst_width, int x, int dx);

 // Specialized scalers for x86.
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width);
 void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width);
 void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -269,11 +269,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
 void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width);
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1553
+#define LIBYUV_VERSION 1554

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
source/planar_functions.cc
@@ -699,11 +699,11 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
     if (IS_ALIGNED(halfwidth, 16)) {
-      ScaleRowDown2 = ScaleRowDown2Box_SSE2;
+      ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
     }
   }
 #endif
source/scale.cc
@@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
-        ScaleRowDown2Box_Any_SSE2);
+#if defined(HAS_SCALEROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
+        ScaleRowDown2Box_Any_SSSE3);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
-          ScaleRowDown2Box_SSE2);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
+          ScaleRowDown2Box_SSSE3);
     }
   }
 #endif
source/scale_any.cc
@@ -55,11 +55,11 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
                      dst_ptr + n * BPP, r);                                 \
     }

-#ifdef HAS_SCALEROWDOWN2_SSE2
-SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
+#ifdef HAS_SCALEROWDOWN2_SSSE3
+SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
       ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
+SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
       2, 1, 15)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
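For context on the trailing 2, 1, 15 arguments (a hedged paraphrase of the
SDANY pattern, not the macro itself): the generated Any wrapper runs the SIMD
kernel on the widest multiple of MASK + 1 output pixels and finishes the
remainder with the C kernel, so FACTOR = 2, BPP = 1 and MASK = 15 give these
kernels 16-pixel granularity:

#include <stddef.h>
#include <stdint.h>

typedef void (*ScaleRowFn)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                           uint8_t* dst_ptr, int dst_width);

// Hedged sketch of an SDANY-style wrapper with FACTOR = 2, BPP = 1, MASK = 15;
// the real macro in scale_any.cc is the authority on the details.
void ScaleRowDown2_Any_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                              uint8_t* dst_ptr, int dst_width,
                              ScaleRowFn simd_row, ScaleRowFn c_row) {
  int r = dst_width & 15;  // remainder handled by the C kernel
  int n = dst_width - r;   // widest multiple of 16 for the SIMD kernel
  if (n > 0) {
    simd_row(src_ptr, src_stride, dst_ptr, n);
  }
  c_row(src_ptr + n * 2, src_stride, dst_ptr + n, r);  // offset = n * FACTOR * BPP
}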
source/scale_gcc.cc
@@ -98,8 +98,8 @@ static uvec16 kScaleAb2 =
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt

-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   asm volatile (
     LABELALIGN
   "1:                                          \n"
@@ -120,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }

-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrlw     $0xf,%%xmm4                     \n"
+    "packuswb  %%xmm4,%%xmm4                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"

     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pavgw     %%xmm5,%%xmm0                   \n"
+    "pavgw     %%xmm5,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "sub       $0x10,%2                        \n"
@@ -147,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   : "+r"(src_ptr),   // %0
     "+r"(dst_ptr),   // %1
     "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
+  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
   );
 }

-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrlw     $0xf,%%xmm4                     \n"
+    "packuswb  %%xmm4,%%xmm4                   \n"
+    "pxor      %%xmm5,%%xmm5                   \n"

     LABELALIGN
   "1:                                          \n"
@@ -164,17 +164,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
     "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm3                   \n"
-    "pavgw     %%xmm2,%%xmm0                   \n"
-    "pavgw     %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x1,%%xmm0                     \n"
+    "psrlw     $0x1,%%xmm1                     \n"
+    "pavgw     %%xmm5,%%xmm0                   \n"
+    "pavgw     %%xmm5,%%xmm1                   \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     "lea       " MEMLEA(0x10,1) ",%1           \n"
     "sub       $0x10,%2                        \n"
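A scalar model may help when reading the rewritten kernels above (a hedged
sketch, not libyuv code): pmaddubsw with the 0x0101 constant in xmm4 does a
pairwise horizontal add of unsigned bytes into 16-bit lanes, paddw adds the
two rows, psrlw 1 halves the sum, and pavgw against the zero in xmm5 supplies
the rounding bit, giving (a + b + c + d + 2) / 4 per output pixel:

#include <assert.h>
#include <stdint.h>

// Scalar model of one output pixel of the sum-then-round box kernel.
uint8_t box2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  uint16_t row0 = (uint16_t)a + b;   // pmaddubsw: horizontal add, top row
  uint16_t row1 = (uint16_t)c + d;   // pmaddubsw: horizontal add, bottom row
  uint16_t sum = row0 + row1;        // paddw: vertical add (max 1020, fits)
  sum >>= 1;                         // psrlw 1
  return (uint8_t)((sum + 1) >> 1);  // pavgw with 0: (x + 0 + 1) >> 1
}

int main(void) {
  // Agrees with the rounding C reference on a coarse grid of byte inputs
  // (the full 256^4 sweep also passes, just more slowly).
  for (unsigned a = 0; a < 256; a += 3)
    for (unsigned b = 0; b < 256; b += 3)
      for (unsigned c = 0; c < 256; c += 3)
        for (unsigned d = 0; d < 256; d += 3)
          assert(box2x2((uint8_t)a, (uint8_t)b, (uint8_t)c, (uint8_t)d) ==
                 (a + b + c + d + 2) / 4);
  return 0;
}

The Windows and AVX2 kernels below follow the same instruction sequence.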
source/scale_win.cc
@@ -95,8 +95,8 @@ static uvec16 kScaleAb2 =

 // Reads 32 pixels, throws half away and writes 16 pixels.
 __declspec(naked)
-void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
@@ -121,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

 // Blends 32x1 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride
     mov        edx, [esp + 12]       // dst_ptr
     mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0

   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4            // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pavgw      xmm0, xmm5            // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1

     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -157,16 +154,19 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

 // Blends 32x2 rectangle to 16x1.
 __declspec(naked)
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
     mov        esi, [esp + 4 + 8]    // src_stride
     mov        edx, [esp + 4 + 12]   // dst_ptr
     mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
+    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    psrlw      xmm4, 15
+    packuswb   xmm4, xmm4
+    pxor       xmm5, xmm5            // constant 0

   wloop:
     movdqu     xmm0, [eax]
@@ -174,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
+    pmaddubsw  xmm0, xmm4            // horizontal add
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    paddw      xmm0, xmm2            // vertical add
+    paddw      xmm1, xmm3
+    psrlw      xmm0, 1
+    psrlw      xmm1, 1
+    pavgw      xmm0, xmm5            // (x + 1) / 2
+    pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1

     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -245,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
@@ -263,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
   }
 }

+// For rounding, average = (sum + 2) / 4
+// becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
 __declspec(naked)
 void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -280,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
     vpxor      ymm5, ymm5, ymm5      // constant 0

   wloop:
-    vmovdqu    ymm0, [eax]           // average rows
+    vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
-    vpavgb     ymm0, ymm0, [eax + esi]
-    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vmovdqu    ymm2, [eax + esi]
+    vmovdqu    ymm3, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
+    vpmaddubsw ymm0, ymm0, ymm4      // horizontal add
     vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    vpaddw     ymm0, ymm0, ymm2      // vertical add
+    vpaddw     ymm1, ymm1, ymm3
+    vpsrlw     ymm0, ymm0, 1
+    vpsrlw     ymm1, ymm1, 1
     vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
     vpavgw     ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     sub        ecx, 32
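The comment introduced above compresses the rounding argument: pavgw computes
(x + y + 1) >> 1, so averaging the halved sum against zero gives
((sum >> 1) + 1) >> 1, which equals (sum + 2) / 4 for every sum a 2x2 box of
bytes can produce. A quick check of that identity (plain C, not part of the
commit):

#include <assert.h>

int main(void) {
  for (unsigned sum = 0; sum <= 4 * 255; ++sum) {
    unsigned rounded = (sum + 2) / 4;          // the target formula
    unsigned via_avg = ((sum >> 1) + 1) >> 1;  // psrlw 1, then pavgw with 0
    assert(rounded == via_avg);
  }
  return 0;
}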
unit_test/planar_test.cc
@@ -1422,8 +1422,8 @@ static void TestI420Blend(int width, int height, int benchmark_iterations,
     EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
   }
   for (int i = 0; i < kSizeUV; ++i) {
-    EXPECT_NEAR(dst_u_c[i + off], dst_u_opt[i + off], 1);  // Subsample off by 1
-    EXPECT_NEAR(dst_v_c[i + off], dst_v_opt[i + off], 1);
+    EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
+    EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
   }
   free_aligned_buffer_64(src_y0);
   free_aligned_buffer_64(src_u0);