Add rounding to InterpolateRow for improved quality and consistency.

Remove the inaccurate specializations for 1/4 and 3/4, since they round
incorrectly.  The specializations for 100% and 50% are kept for performance.
Make C and ARM code match SSSE3.
Make unittests expect zero difference.

BUG=libyuv:535
R=harryjin@google.com

Review URL: https://codereview.chromium.org/1533643005 .
Frank Barchard 2015-12-17 15:24:06 -08:00
parent 1ccbf8fb7b
commit f4447745ae
12 changed files with 57 additions and 512 deletions
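
For reference, the per-pixel arithmetic that the C, SSSE3, AVX2 and NEON paths now share is a 7-bit weighted blend with a +64 rounding term, replacing the old truncating blend. A minimal standalone sketch in plain C (the function name is illustrative, not part of the library):

#include <stdint.h>

// One output pixel of the row blend. source_y_fraction is 0..256 as before;
// the row functions halve it to a 0..128 weight for the second row.
static uint8_t BlendPixelRounded(uint8_t row0, uint8_t row1,
                                 int source_y_fraction) {
  int y1 = source_y_fraction >> 1;  // weight of row1 (0..128)
  int y0 = 128 - y1;                // weight of row0
  // Previously: (row0 * (256 - f) + row1 * f) >> 8, which truncates.
  // Now: add 64 (half of 1 << 7) before the shift, i.e. round to nearest.
  return (uint8_t)((row0 * y0 + row1 * y1 + 64) >> 7);
}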

@@ -175,7 +175,6 @@ extern "C" {
#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
#define HAS_INTERPOLATEROW_SSE2
#define HAS_INTERPOLATEROW_SSSE3
#define HAS_RGBCOLORTABLEROW_X86
#define HAS_SOBELROW_SSE2
@@ -1838,9 +1837,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr,
int width, int source_y_fraction);
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
@@ -1856,9 +1852,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);

@@ -1870,14 +1870,6 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
height = 1;
src_stride0 = src_stride1 = dst_stride = 0;
}
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2467,14 +2459,6 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2571,14 +2555,6 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -593,9 +593,6 @@ ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
#ifdef HAS_INTERPOLATEROW_SSSE3
ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_SSE2
ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
#endif

@@ -2211,27 +2211,30 @@ static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride,
int width, int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
int y1_fraction = source_y_fraction >> 1;
int y0_fraction = 128 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
int x;
if (source_y_fraction == 0) {
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
}
if (source_y_fraction == 128) {
if (y1_fraction == 64) {
HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 64) >> 7;
dst_ptr[1] =
(src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 64) >> 7;
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
}
if (width & 1) {
dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 64) >> 7;
}
}
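
The practical effect shows up in the unittests below: with the old truncating formula a 50/50 blend of 0 and 255 comes out as 127, while a correctly rounded blend gives 128, which is the off-by-one the old test comment "C = 127, SSE = 128" was tolerating. A tiny self-contained check of the arithmetic (plain C, illustrative only):

#include <stdio.h>

int main(void) {
  int row0 = 0, row1 = 255;
  int truncated = (row0 * 128 + row1 * 128) >> 8;    // old style: 127
  int rounded = (row0 * 64 + row1 * 64 + 64) >> 7;   // new style: 128
  printf("truncated=%d rounded=%d\n", truncated, rounded);
  return 0;
}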

@@ -4794,12 +4794,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
"je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
@@ -4808,6 +4804,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"mov $0x400040,%%eax \n"
"movd %%eax,%%xmm4 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
@@ -4819,6 +4818,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"punpckhbw %%xmm2,%%xmm1 \n"
"pmaddubsw %%xmm5,%%xmm0 \n"
"pmaddubsw %%xmm5,%%xmm1 \n"
"paddw %%xmm4,%%xmm0 \n"
"paddw %%xmm4,%%xmm1 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
@@ -4828,19 +4829,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
LABELALIGN
"25: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
@@ -4853,19 +4841,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
LABELALIGN
"75: \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm0)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
@@ -4881,8 +4856,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm5"
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSSE3
@@ -4897,12 +4872,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
"cmp $0x0,%3 \n"
"je 100f \n"
"sub %1,%0 \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
"je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"vmovd %3,%%xmm0 \n"
"neg %3 \n"
@@ -4912,6 +4883,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
"vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpermd %%ymm5,%%ymm0,%%ymm5 \n"
"mov $0x400040,%%eax \n"
"vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
// General purpose row blend.
LABELALIGN
@@ -4922,6 +4896,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
"vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
@@ -4931,19 +4907,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
LABELALIGN
"25: \n"
"vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
"vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
@@ -4955,19 +4918,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
LABELALIGN
"75: \n"
"vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
"vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
@@ -4982,123 +4932,12 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
"+c"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm5"
: "memory", "cc", "eax", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_AVX2
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
"je 50f \n"
"cmp $0x60,%3 \n"
"je 25f \n"
"movd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x80,%3 \n"
"movd %3,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
"punpckhbw %%xmm4,%%xmm1 \n"
"psubw %%xmm0,%%xmm2 \n"
"psubw %%xmm1,%%xmm3 \n"
"paddw %%xmm2,%%xmm2 \n"
"paddw %%xmm3,%%xmm3 \n"
"pmulhw %%xmm5,%%xmm2 \n"
"pmulhw %%xmm5,%%xmm3 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
LABELALIGN
"25: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
LABELALIGN
"75: \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", NACL_R14
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSE2
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
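
The removed 25/75 and 75/25 fast paths built the blend out of two pavgb averages (vrhadd on ARM), and each average rounds up on its own, so the result can disagree by one with the rounded formula the general path now uses; that is the "round incorrectly" noted in the description. A small sketch of the mismatch (plain C; the helper name is illustrative, and pavgb computes (a + b + 1) >> 1 per byte):

#include <stdio.h>
#include <stdint.h>

// What pavgb/vrhadd compute for one byte lane.
static uint8_t AvgRound(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

int main(void) {
  uint8_t row0 = 1, row1 = 0;
  // Removed 25/75 specialization: avg(avg(row0, row1), row1), rounded twice.
  uint8_t old_result = AvgRound(AvgRound(row0, row1), row1);
  // General rounded blend, y1 = 96 (i.e. 3/4 weight on row1).
  uint8_t new_result = (uint8_t)((row0 * 32 + row1 * 96 + 64) >> 7);
  printf("old = %u, new = %u\n", old_result, new_result);  // old = 1, new = 0
  return 0;
}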

@@ -2259,19 +2259,16 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
int y1_fraction = source_y_fraction >> 1;
asm volatile (
"cmp %4, #0 \n"
"beq 100f \n"
"add %2, %1 \n"
"cmp %4, #64 \n"
"beq 75f \n"
"cmp %4, #128 \n"
"beq 50f \n"
"cmp %4, #192 \n"
"beq 25f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"rsb %4, #128 \n"
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
@@ -2284,27 +2281,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vrshrn.u16 d0, q13, #7 \n"
"vrshrn.u16 d1, q14, #7 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
@@ -2318,20 +2301,6 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
@@ -2346,7 +2315,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
"+r"(y1_fraction) // %4
:
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
);
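
On NEON the rounding itself comes for free: vrshrn/rshrn is a rounding narrowing shift, so only the shift amount (#8 to #7) and the halved fraction change, which is what lets the ARM results match the SSSE3 path exactly. An intrinsics rendering of the same inner loop, as a sketch only (the library ships the inline assembly above, not this code):

#include <arm_neon.h>
#include <stdint.h>

// Blend 16 pixels of row0/row1 with weights y0 + y1 == 128.
static void BlendRow16_NEON(uint8_t* dst, const uint8_t* row0,
                            const uint8_t* row1, int y0, int y1) {
  uint8x16_t r0 = vld1q_u8(row0);
  uint8x16_t r1 = vld1q_u8(row1);
  uint8x8_t w0 = vdup_n_u8((uint8_t)y0);
  uint8x8_t w1 = vdup_n_u8((uint8_t)y1);
  uint16x8_t lo = vmull_u8(vget_low_u8(r0), w0);   // row0 * y0
  uint16x8_t hi = vmull_u8(vget_high_u8(r0), w0);
  lo = vmlal_u8(lo, vget_low_u8(r1), w1);          // + row1 * y1
  hi = vmlal_u8(hi, vget_high_u8(r1), w1);
  // Rounding shift by 7 adds 64 before narrowing, like the x86 paddw/psrlw.
  uint8x16_t out = vcombine_u8(vrshrn_n_u16(lo, 7), vrshrn_n_u16(hi, 7));
  vst1q_u8(dst, out);
}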

@@ -2336,18 +2336,14 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
int y1_fraction = source_y_fraction >> 1;
int y0_fraction = 128 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile (
"cmp %w4, #0 \n"
"b.eq 100f \n"
"cmp %w4, #64 \n"
"b.eq 75f \n"
"cmp %w4, #128 \n"
"b.eq 50f \n"
"cmp %w4, #192 \n"
"b.eq 25f \n"
"dup v5.16b, %w4 \n"
"dup v4.16b, %w5 \n"
@@ -2362,27 +2358,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"umull2 v3.8h, v0.16b, v4.16b \n"
"umlal v2.8h, v1.8b, v5.8b \n"
"umlal2 v3.8h, v1.16b, v5.16b \n"
"rshrn v0.8b, v2.8h, #8 \n"
"rshrn2 v0.16b, v3.8h, #8 \n"
"rshrn v0.8b, v2.8h, #7 \n"
"rshrn2 v0.16b, v3.8h, #7 \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
"ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v1.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
@@ -2396,20 +2378,6 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b.gt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
"ld1 {v0.16b}, [%2], #16 \n"
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n"
"b.gt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)

@@ -5571,12 +5571,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
cmp eax, 0
je xloop100 // 0 / 128. Blend 100 / 0.
sub edi, esi
cmp eax, 32
je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
cmp eax, 64
je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
cmp eax, 96
je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
vmovd xmm0, eax // high fraction 0..127
neg eax
@@ -5587,6 +5583,10 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpxor ymm0, ymm0, ymm0
vpermd ymm5, ymm0, ymm5
mov eax, 0x00400040 // 64 for rounding.
vmovd xmm4, eax
vbroadcastss ymm4, xmm4
xloop:
vmovdqu ymm0, [esi]
vmovdqu ymm2, [esi + edx]
@@ -5594,6 +5594,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpunpcklbw ymm0, ymm0, ymm2 // mutates
vpmaddubsw ymm0, ymm0, ymm5
vpmaddubsw ymm1, ymm1, ymm5
vpaddw ymm0, ymm0, ymm4
vpaddw ymm1, ymm1, ymm4
vpsrlw ymm0, ymm0, 7
vpsrlw ymm1, ymm1, 7
vpackuswb ymm0, ymm0, ymm1 // unmutates
@@ -5603,18 +5605,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
jg xloop
jmp xloop99
// Blend 25 / 75.
xloop25:
vmovdqu ymm0, [esi]
vmovdqu ymm1, [esi + edx]
vpavgb ymm0, ymm0, ymm1
vpavgb ymm0, ymm0, ymm1
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
jg xloop25
jmp xloop99
// Blend 50 / 50.
xloop50:
vmovdqu ymm0, [esi]
@@ -5625,18 +5615,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
jg xloop50
jmp xloop99
// Blend 75 / 25.
xloop75:
vmovdqu ymm1, [esi]
vmovdqu ymm0, [esi + edx]
vpavgb ymm0, ymm0, ymm1
vpavgb ymm0, ymm0, ymm1
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
xloop100:
rep movsb
@@ -5668,12 +5646,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 / 128. Blend 100 / 0.
cmp eax, 32
je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
cmp eax, 64
je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
cmp eax, 96
je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
movd xmm0, eax // high fraction 0..127
neg eax
@@ -5683,6 +5657,10 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
mov eax, 0x00400040 // 64 for rounding.
movd xmm4, eax
pshufd xmm4, xmm4, 0x00
xloop:
movdqu xmm0, [esi]
movdqu xmm2, [esi + edx]
@@ -5691,6 +5669,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
punpckhbw xmm1, xmm2
pmaddubsw xmm0, xmm5
pmaddubsw xmm1, xmm5
paddw xmm0, xmm4
paddw xmm1, xmm4
psrlw xmm0, 7
psrlw xmm1, 7
packuswb xmm0, xmm1
@@ -5700,18 +5680,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jg xloop
jmp xloop99
// Blend 25 / 75.
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop25
jmp xloop99
// Blend 50 / 50.
xloop50:
movdqu xmm0, [esi]
@@ -5723,18 +5691,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jg xloop50
jmp xloop99
// Blend 75 / 25.
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
xloop100:
movdqu xmm0, [esi]
@@ -5750,114 +5706,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
}
}
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked)
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
__asm {
push esi
push edi
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
// Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 / 256. Blend 100 / 0.
cmp eax, 64
je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
cmp eax, 128
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
cmp eax, 192
je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
movd xmm5, eax // xmm5 = y fraction
punpcklbw xmm5, xmm5
psrlw xmm5, 1
punpcklwd xmm5, xmm5
punpckldq xmm5, xmm5
punpcklqdq xmm5, xmm5
pxor xmm4, xmm4
xloop:
movdqu xmm0, [esi] // row0
movdqu xmm2, [esi + edx] // row1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
punpcklbw xmm2, xmm4
punpckhbw xmm3, xmm4
punpcklbw xmm0, xmm4
punpckhbw xmm1, xmm4
psubw xmm2, xmm0 // row1 - row0
psubw xmm3, xmm1
paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
paddw xmm3, xmm3
pmulhw xmm2, xmm5 // scale diff
pmulhw xmm3, xmm5
paddw xmm0, xmm2 // sum rows
paddw xmm1, xmm3
packuswb xmm0, xmm1
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop
jmp xloop99
// Blend 25 / 75.
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop25
jmp xloop99
// Blend 50 / 50.
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop50
jmp xloop99
// Blend 75 / 25.
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
xloop100:
movdqu xmm0, [esi]
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
sub ecx, 16
jg xloop100
xloop99:
pop edi
pop esi
ret
}
}
#endif // HAS_INTERPOLATEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,

@@ -875,14 +875,6 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -1072,14 +1064,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
&x, &y, &dx, &dy);
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -210,14 +210,6 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
src_argb += xl * 4;
x -= (int)(xl << 16);
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -308,14 +300,6 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
int dst_width, int x, int dx) =
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -494,14 +478,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -876,14 +876,6 @@ void ScalePlaneVertical(int src_height,
assert(dst_width > 0);
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;

@@ -909,15 +909,15 @@ TEST_F(LibYUVPlanarTest, TestARGBInterpolate) {
EXPECT_EQ(0u, interpolate_pixels[1][0]);
EXPECT_EQ(0u, interpolate_pixels[1][1]);
EXPECT_EQ(0u, interpolate_pixels[1][2]);
EXPECT_NEAR(128u, interpolate_pixels[1][3], 1); // C = 127, SSE = 128.
EXPECT_EQ(128u, interpolate_pixels[1][3]);
EXPECT_EQ(0u, interpolate_pixels[2][0]);
EXPECT_EQ(0u, interpolate_pixels[2][1]);
EXPECT_EQ(0u, interpolate_pixels[2][2]);
EXPECT_EQ(0u, interpolate_pixels[2][3]);
EXPECT_NEAR(128u, interpolate_pixels[3][0], 1);
EXPECT_NEAR(128u, interpolate_pixels[3][1], 1);
EXPECT_NEAR(128u, interpolate_pixels[3][2], 1);
EXPECT_NEAR(128u, interpolate_pixels[3][3], 1);
EXPECT_EQ(128u, interpolate_pixels[3][0]);
EXPECT_EQ(128u, interpolate_pixels[3][1]);
EXPECT_EQ(128u, interpolate_pixels[3][2]);
EXPECT_EQ(128u, interpolate_pixels[3][3]);
ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
&interpolate_pixels[0][0], 0, 4, 1, 0);
@@ -991,15 +991,15 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
EXPECT_EQ(0u, interpolate_pixels[4]);
EXPECT_EQ(0u, interpolate_pixels[5]);
EXPECT_EQ(0u, interpolate_pixels[6]);
EXPECT_NEAR(128u, interpolate_pixels[7], 1); // C = 127, SSE = 128.
EXPECT_EQ(128u, interpolate_pixels[7]);
EXPECT_EQ(0u, interpolate_pixels[8]);
EXPECT_EQ(0u, interpolate_pixels[9]);
EXPECT_EQ(0u, interpolate_pixels[10]);
EXPECT_EQ(0u, interpolate_pixels[11]);
EXPECT_NEAR(128u, interpolate_pixels[12], 1);
EXPECT_NEAR(128u, interpolate_pixels[13], 1);
EXPECT_NEAR(128u, interpolate_pixels[14], 1);
EXPECT_NEAR(128u, interpolate_pixels[15], 1);
EXPECT_EQ(128u, interpolate_pixels[12]);
EXPECT_EQ(128u, interpolate_pixels[13]);
EXPECT_EQ(128u, interpolate_pixels[14]);
EXPECT_EQ(128u, interpolate_pixels[15]);
InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
&interpolate_pixels[0], 0, 16, 1, 0);
@@ -1013,12 +1013,12 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
EXPECT_EQ(4u, interpolate_pixels[0]);
EXPECT_EQ(8u, interpolate_pixels[1]);
EXPECT_EQ(16u,interpolate_pixels[2]);
EXPECT_EQ(16u, interpolate_pixels[2]);
EXPECT_EQ(32u, interpolate_pixels[3]);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
&interpolate_pixels[0], 0, 1280, 1, 128);
&interpolate_pixels[0], 0, 1280, 1, 123);
}
}