Mirror of https://chromium.googlesource.com/libyuv/libyuv
Synced 2025-12-07 17:26:49 +08:00
Use rounding in scale down by 2

When scaling down by 2, the formula should round consistently: (a + b + c + d + 2) / 4. The C version did, but the SSE2 version was doing two averages: avg(avg(a, b), avg(c, d)). This change uses a sum, then rounds.

R=dhrosa@google.com, harryjin@google.com
BUG=libyuv:447,libyuv:527

Review URL: https://codereview.chromium.org/1513183004 .
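As a rough illustration of the difference described above (a minimal C sketch, not libyuv code; the helper names are made up), summing all four pixels and rounding once gives (a + b + c + d + 2) / 4, while nesting two rounding averages can come out one higher, because each pavgb-style average rounds up:

#include <stdint.h>
#include <stdio.h>

// Round once over the 2x2 box, as the C reference path does.
static uint8_t Box2x2SumRound(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}

// Two nested rounding averages, matching what the old SSE2 path effectively did.
// AvgRound mirrors pavgb: (x + y + 1) >> 1.
static uint8_t AvgRound(uint8_t x, uint8_t y) {
  return (uint8_t)((x + y + 1) >> 1);
}

static uint8_t Box2x2DoubleAvg(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return AvgRound(AvgRound(a, b), AvgRound(c, d));
}

int main(void) {
  // The two forms disagree for e.g. (0, 0, 0, 1): 0 vs 1.
  printf("%d %d\n", Box2x2SumRound(0, 0, 0, 1), Box2x2DoubleAvg(0, 0, 0, 1));
  return 0;
}

The new SSSE3 and AVX2 Box kernels below take the first form: pmaddubsw adds pixel pairs, paddw adds the two rows, then psrlw by 1 followed by pavgw against zero applies the single rounding.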
This commit is contained in:
parent 71b60123dc
commit ae55e41851
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1553
Version: 1554
License: BSD
License File: LICENSE
@@ -56,7 +56,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSE2
#define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSE2
@@ -232,12 +232,12 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);

// Specialized scalers for x86.
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -269,11 +269,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 1553
#define LIBYUV_VERSION 1554

#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -699,11 +699,11 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = ScaleRowDown2Box_Any_SSE2;
#if defined(HAS_SCALEROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3;
if (IS_ALIGNED(halfwidth, 16)) {
ScaleRowDown2 = ScaleRowDown2Box_SSE2;
ScaleRowDown2 = ScaleRowDown2Box_SSSE3;
}
}
#endif
@@ -61,15 +61,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
}
}
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 :
ScaleRowDown2Box_Any_SSE2);
#if defined(HAS_SCALEROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
ScaleRowDown2Box_Any_SSSE3);
if (IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
ScaleRowDown2Box_SSE2);
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
ScaleRowDown2Box_SSSE3);
}
}
#endif
@@ -55,11 +55,11 @@ CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
dst_ptr + n * BPP, r); \
}

#ifdef HAS_SCALEROWDOWN2_SSE2
SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2,
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
ScaleRowDown2Linear_C, 2, 1, 15)
SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
2, 1, 15)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
@@ -98,8 +98,8 @@ static uvec16 kScaleAb2 =
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
LABELALIGN
"1: \n"
@@ -120,26 +120,24 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}

void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"

LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
@@ -147,15 +145,17 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:: "memory", "cc", "xmm0", "xmm1", "xmm5"
:: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}

void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"

LABELALIGN
"1: \n"
@@ -164,17 +164,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psrlw $0x8,%%xmm1 \n"
"pand %%xmm5,%%xmm2 \n"
"pand %%xmm5,%%xmm3 \n"
"pavgw %%xmm2,%%xmm0 \n"
"pavgw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"psrlw $0x1,%%xmm0 \n"
"psrlw $0x1,%%xmm1 \n"
"pavgw %%xmm5,%%xmm0 \n"
"pavgw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
@@ -95,8 +95,8 @@ static uvec16 kScaleAb2 =

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
@@ -121,31 +121,28 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8

pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0

wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]

movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
packuswb xmm0, xmm1

movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -157,16 +154,19 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8

pcmpeqb xmm4, xmm4 // constant 0x0101
psrlw xmm4, 15
packuswb xmm4, xmm4
pxor xmm5, xmm5 // constant 0

wloop:
movdqu xmm0, [eax]
@@ -174,19 +174,17 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
movdqu xmm2, [eax + esi]
movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3

movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
psrlw xmm0, 8
movdqa xmm3, xmm1
psrlw xmm1, 8
pand xmm2, xmm5
pand xmm3, xmm5
pavgw xmm0, xmm2
pavgw xmm1, xmm3
pmaddubsw xmm0, xmm4 // horizontal add
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
pmaddubsw xmm3, xmm4
paddw xmm0, xmm2 // vertical add
paddw xmm1, xmm3
psrlw xmm0, 1
psrlw xmm1, 1
pavgw xmm0, xmm5 // (x + 1) / 2
pavgw xmm1, xmm5
packuswb xmm0, xmm1

movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 16
@@ -245,14 +243,12 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]

vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -263,6 +259,8 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
}
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
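// (worked example: pavgw computes (a + b + 1) >> 1, so average((sum >> 1), 0)
// is ((sum >> 1) + 1) >> 1, which equals (sum + 2) / 4 for any non-negative sum;
// e.g. sum = 5 gives (5 + 2) / 4 = 1 and ((5 >> 1) + 1) >> 1 = 1.)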
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -280,19 +278,23 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vpxor ymm5, ymm5, ymm5 // constant 0

wloop:
vmovdqu ymm0, [eax] // average rows
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
vpavgb ymm0, ymm0, [eax + esi]
vpavgb ymm1, ymm1, [eax + esi + 32]
vmovdqu ymm2, [eax + esi]
vmovdqu ymm3, [eax + esi + 32]
lea eax, [eax + 64]

vpmaddubsw ymm0, ymm0, ymm4 // average horizontally
vpmaddubsw ymm0, ymm0, ymm4 // horizontal add
vpmaddubsw ymm1, ymm1, ymm4
vpmaddubsw ymm2, ymm2, ymm4
vpmaddubsw ymm3, ymm3, ymm4
vpaddw ymm0, ymm0, ymm2 // vertical add
vpaddw ymm1, ymm1, ymm3
vpsrlw ymm0, ymm0, 1
vpsrlw ymm1, ymm1, 1
vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2
vpavgw ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb

vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 32
@@ -1422,8 +1422,8 @@ static void TestI420Blend(int width, int height, int benchmark_iterations,
EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]);
}
for (int i = 0; i < kSizeUV; ++i) {
EXPECT_NEAR(dst_u_c[i + off], dst_u_opt[i + off], 1); // Subsample off by 1
EXPECT_NEAR(dst_v_c[i + off], dst_v_opt[i + off], 1);
EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]);
EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]);
}
free_aligned_buffer_64(src_y0);
free_aligned_buffer_64(src_u0);