2x down sample for UV planes ported to SSSE3 / NEON

Bug: libyuv:838
Change-Id: Id9fb3282a3e86143d76b5e0cb557f0523a88b3c8
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2465578
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Authored by Frank Barchard on 2020-10-12 16:06:09 -07:00; committed by Commit Bot
parent b6f3cff282
commit d730dc2f18
12 changed files with 334 additions and 144 deletions
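At its core, the commit vectorizes libyuv's 2x2 box filter for interleaved UV planes: each output UV pair is the rounded average of a 2x2 block of source UV pairs. A minimal standalone C sketch of that reference filter, mirroring the ScaleUVRowDown2Box_C code that appears later in this diff (the standalone wrapper function here is illustrative):

#include <stddef.h>
#include <stdint.h>

// 2x2 box filter for an interleaved UV row pair: averages U and V
// separately, with +2 for round-to-nearest before the divide by 4.
static void UVRowDown2Box(const uint8_t* src_uv, ptrdiff_t src_stride,
                          uint8_t* dst_uv, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
                 src_uv[src_stride + 2] + 2) >> 2;  // U
    dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
                 src_uv[src_stride + 3] + 2) >> 2;  // V
    src_uv += 4;  // consume 2 source UV pairs
    dst_uv += 2;  // produce 1 destination UV pair
  }
}

The SSSE3 and NEON kernels below compute exactly this, 4 and 8 output pairs per loop iteration respectively.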


@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1765
Version: 1766
License: BSD
License File: LICENSE


@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
int width,
int height);
// Aliases
#define ABGRToRGB24 ARGBToRAW
#define ABGRToRAW ARGBToRGB24
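These aliases hold because, in libyuv's little-endian byte-order naming, each pair is the same byte shuffle; a sketch of why:

// Byte order in memory: ARGB = B,G,R,A   ABGR = R,G,B,A
//                        RGB24 = B,G,R    RAW  = R,G,B
// ABGRToRGB24: R,G,B,A -> B,G,R  == ARGBToRAW:   B,G,R,A -> R,G,B
// ABGRToRAW:   R,G,B,A -> R,G,B  == ARGBToRGB24: B,G,R,A -> B,G,R
// Both drop alpha; the 3-byte shuffle (reverse vs. copy) is identical.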
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,


@@ -72,6 +72,13 @@ extern "C" {
#define HAS_SCALEROWDOWN4_SSSE3
#endif
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#endif
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
@@ -98,6 +105,11 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#endif
// The following are available on 64 bit Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALEUVROWDOWN2BOX_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_SCALEADDROW_MSA
#define HAS_SCALEARGBCOLS_MSA
@@ -830,15 +842,15 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
int dst_width);
// UV Row functions
void ScaleUVRowDown2_SSE2(const uint8_t* src_uv,
void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Linear_SSE2(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Box_SSE2(const uint8_t* src_uv,
void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
@@ -846,7 +858,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
@@ -854,42 +866,42 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleUVRowDown2_MSA(const uint8_t* src_uv,
void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Linear_MSA(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Box_MSA(const uint8_t* src_uv,
void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2_MMI(const uint8_t* src_uv,
void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Linear_MMI(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Box_MMI(const uint8_t* src_uv,
void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -926,52 +938,52 @@ void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDownEven_SSE2(const uint8_t* src_uv,
void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_SSE2(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_NEON(const uint8_t* src_uv,
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_MSA(const uint8_t* src_uv,
void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_MMI(const uint8_t* src_uv,
void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_Any_SSE2(const uint8_t* src_ptr,
void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_ptr,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1765
#define LIBYUV_VERSION 1766
#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -20,49 +20,6 @@ namespace libyuv {
extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
int dx) { \
int r = dst_width & MASK; \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
}
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEFILTERCOLS_MSA
CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_MSA
CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBCOLS_MMI
CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C,
4,
3)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_MSA
CANY(ScaleARGBFilterCols_Any_MSA,
ScaleARGBFilterCols_MSA,
ScaleARGBFilterCols_C,
4,
7)
#endif
#undef CANY
// Fixed scale down.
// Mask may be non-power of 2, so use MOD
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
@@ -113,6 +70,14 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3,
1,
15)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
SDANY(ScaleUVRowDown2Box_Any_SSSE3,
ScaleUVRowDown2Box_SSSE3,
ScaleUVRowDown2Box_C,
2,
2,
4)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
@@ -155,6 +120,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1,
15)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
SDANY(ScaleUVRowDown2Box_Any_NEON,
ScaleUVRowDown2Box_NEON,
ScaleUVRowDown2Box_C,
2,
2,
8)
#endif
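Both registrations use the SDANY wrapper (defined earlier in this file): the SIMD kernel covers the widest prefix it can, and the C kernel finishes the leftover columns. A sketch of what the NEON registration above plausibly expands to, with FACTOR = 2 (two source UV pairs per output) and BPP = 2 (bytes per UV pair); the remainder uses MOD rather than a bitwise AND because, per the comment above, the mask may be non-power-of-2:

// Illustrative expansion of SDANY(ScaleUVRowDown2Box_Any_NEON, ..., 2, 2, 8).
void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst_ptr, int dst_width) {
  int r = (int)((unsigned int)dst_width % (8 + 1));  // leftover outputs
  int n = dst_width - r;                             // SIMD-covered outputs
  if (n > 0) {
    ScaleUVRowDown2Box_NEON(src_ptr, src_stride, dst_ptr, n);
  }
  ScaleUVRowDown2Box_C(src_ptr + (n * 2) * 2,  // n outputs * FACTOR * BPP bytes
                       src_stride, dst_ptr + n * 2, r);
}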
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
@@ -577,6 +551,49 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
#endif // SASIMDONLY
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
int dx) { \
int r = dst_width & MASK; \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
}
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEFILTERCOLS_MSA
CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_MSA
CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBCOLS_MMI
CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C,
4,
3)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_MSA
CANY(ScaleARGBFilterCols_Any_MSA,
ScaleARGBFilterCols_MSA,
ScaleARGBFilterCols_C,
4,
7)
#endif
#undef CANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@@ -1063,11 +1063,9 @@ void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
int x;
for (x = 0; x < dst_width; ++x) {
dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
src_uv[src_stride + 2] + 2) >>
2;
src_uv[src_stride + 2] + 2) >> 2;
dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
src_uv[src_stride + 3] + 2) >>
2;
src_uv[src_stride + 3] + 2) >> 2;
src_uv += 4;
dst_uv += 2;
}


@@ -1366,6 +1366,52 @@ int FixedDiv1_X86(int num, int div) {
return num;
}
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
// Shuffle table for splitting UV into upper and lower part of register.
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, 6u, 14u,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5, %%xmm5 \n" // zero
"movdqa %4,%%xmm1 \n" // split shuffler
"movdqa %5,%%xmm3 \n" // merge shuffler
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 8 UV row 0
"movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
"lea 0x10(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
"pshufb %%xmm1,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
"pmaddubsw %%xmm4,%%xmm2 \n"
"paddw %%xmm2,%%xmm0 \n" // vertical add
"psrlw $0x1,%%xmm0 \n" // round
"pavgw %%xmm5,%%xmm0 \n"
"pshufb %%xmm3,%%xmm0 \n" // merge uv
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n" // 4 UV
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
#endif // defined(__x86_64__) || defined(__i386__)
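The tail of the kernel carries the interesting arithmetic: pmaddubsw against the 0x01 words gives the horizontal pair sums, paddw adds the two rows, and then psrlw $1 followed by pavgw against zero (pavgw computes (a + b + 1) >> 1) produces floor((sum + 2) / 4), exactly the C code's (sum + 2) >> 2 rounding. A scalar check of that identity over the full range of a four-sample sum:

#include <assert.h>
#include <stdint.h>

// pavgw-style rounded average of two 16-bit values.
static uint16_t Pavgw(uint16_t a, uint16_t b) {
  return (uint16_t)((a + b + 1) >> 1);
}

int main(void) {
  uint16_t sum;
  // A sum of four 8-bit samples is at most 4 * 255 = 1020.
  for (sum = 0; sum <= 1020; ++sum) {
    assert(Pavgw(sum >> 1, 0) == (sum + 2) >> 2);  // psrlw $1, then pavgw 0
  }
  return 0;
}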
#ifdef __cplusplus


@@ -950,6 +950,35 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef LOAD2_DATA32_LANE
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
"vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
"vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
"vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
"vrshrn.u16 d1, q1, #2 \n"
"vst2.8 {d0, d1}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "q0", "q1", "q8", "q9");
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
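The 32-bit Neon version reaches the same (sum + 2) >> 2 result more directly: vpaddl.u8 widens and adds horizontal pairs from row 0, vpadal.u8 accumulates the pairs from row 1, and vrshrn.u16 #2 is a rounding narrowing shift, folding the +2 bias and the shift into one instruction (the AArch64 version in the next file uses the equivalent uaddlp/uadalp/rshrn). A scalar model of the per-sample pipeline, as a sketch:

// One output byte of the Neon pipeline (shown for U; V is identical):
static uint8_t BoxSample(uint8_t a0, uint8_t a1,    // row 0 neighbors
                         uint8_t b0, uint8_t b1) {  // row 1 neighbors
  uint16_t sum = (uint16_t)(a0 + a1);  // vpaddl.u8: pairwise add, widen
  sum = (uint16_t)(sum + b0 + b1);     // vpadal.u8: pairwise add-accumulate
  return (uint8_t)((sum + 2) >> 2);    // vrshrn.u16 #2: rounding narrow shift
}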
#ifdef __cplusplus


@@ -1086,6 +1086,35 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
);
}
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
"ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
"uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"prfm pldl1keep, [%1, 448] \n"
"rshrn v1.8b, v1.8h, #2 \n"
"st2 {v0.8b,v1.8b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v16", "v17");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus


@@ -73,22 +73,40 @@ static void ScaleUVDown2(int src_width,
src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
}
#if defined(HAS_SCALEUVROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_Any_SSE2
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSE2
: ScaleUVRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 2)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_SSE2
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSE2
: ScaleUVRowDown2Box_SSE2);
#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
}
}
#endif
// This code is not enabled. Only box filter is available at this time.
#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_Any_SSSE3
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
: ScaleUVRowDown2Box_Any_SSSE3);
if (IS_ALIGNED(dst_width, 2)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_SSSE3
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
: ScaleUVRowDown2Box_SSSE3);
}
}
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
if (TestCpuFlag(kCpuHasNEON) && filtering) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
}
}
#endif
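All of these kernels are reached through the public UVScale() entry point, which selects a row function per the dispatch above. A minimal usage sketch (dimensions and buffer management are illustrative), halving the chroma plane of an NV12 720p frame with the box filter, which now takes the SSSE3/NEON path:

#include "libyuv/scale.h"     // FilterMode
#include "libyuv/scale_uv.h"  // UVScale()

// 1280x720 NV12 has a 640x360 interleaved UV plane; halve it to 320x180.
// Strides are in bytes: 2 bytes per UV pair. Returns 0 on success.
int HalveChroma(const uint8_t* src_uv, uint8_t* dst_uv) {
  return UVScale(src_uv, 640 * 2, 640, 360,
                 dst_uv, 320 * 2, 320, 180,
                 kFilterBox);
}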
// This code is not enabled. Only box filter is available at this time.
#if defined(HAS_SCALEUVROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleUVRowDown2 =
@@ -180,11 +198,11 @@ static void ScaleUVDown4Box(int src_width,
(void)dx;
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEUVROWDOWN2_SSE2)
#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSE2;
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_SSE2;
ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
}
}
#endif
@@ -237,13 +255,13 @@ static void ScaleUVDownEven(int src_width,
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
#if defined(HAS_SCALEUVROWDOWNEVEN_SSE2)
#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSE2
: ScaleUVRowDownEven_Any_SSE2;
ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
: ScaleUVRowDownEven_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
ScaleUVRowDownEven =
filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSE2;
filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
}
}
#endif
@@ -494,9 +512,9 @@ static void ScaleUVBilinearUp(int src_width,
}
}
#endif
#if defined(HAS_SCALEUVCOLS_SSE2)
#if defined(HAS_SCALEUVCOLS_SSSE3)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleUVFilterCols = ScaleUVCols_SSE2;
ScaleUVFilterCols = ScaleUVCols_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLS_NEON)
@@ -525,9 +543,9 @@ static void ScaleUVBilinearUp(int src_width,
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleUVFilterCols = ScaleUVColsUp2_C;
#if defined(HAS_SCALEUVCOLSUP2_SSE2)
#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleUVFilterCols = ScaleUVColsUp2_SSE2;
ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLSUP2_MMI)
@@ -612,9 +630,9 @@ static void ScaleUVSimple(int src_width,
int x, int dx) =
(src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
(void)src_height;
#if defined(HAS_SCALEUVCOLS_SSE2)
#if defined(HAS_SCALEUVCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleUVCols = ScaleUVCols_SSE2;
ScaleUVCols = ScaleUVCols_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLS_NEON)
@@ -643,9 +661,9 @@ static void ScaleUVSimple(int src_width,
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleUVCols = ScaleUVColsUp2_C;
#if defined(HAS_SCALEUVCOLSUP2_SSE2)
#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleUVCols = ScaleUVColsUp2_SSE2;
ScaleUVCols = ScaleUVColsUp2_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLSUP2_MMI)


@@ -1114,6 +1114,8 @@ TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1)
TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1)
TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1)
TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1)
TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1)
TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1)
#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
#endif


@@ -14,7 +14,6 @@
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale_uv.h"
#include "libyuv/video_common.h"
namespace libyuv {
@@ -23,13 +22,13 @@ namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int UVTestFilter(int src_width,
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -47,7 +46,8 @@ static int UVTestFilter(int src_width,
}
MemRandomize(src_uv, src_uv_plane_size);
int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL;
int64_t dst_uv_plane_size =
(dst_width + b * 2) * (dst_height + b * 2) * 2LL;
int dst_stride_uv = (b * 2 + dst_width) * 2;
align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
@@ -61,28 +61,29 @@ static int UVTestFilter(int src_width,
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2,
dst_stride_uv, dst_width, dst_height, f);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2,
dst_stride_uv, dst_width, dst_height, f);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
double c_time = get_time();
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2,
dst_stride_uv, dst_width, dst_height, f);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < benchmark_iterations; ++i) {
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height,
dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
}
opt_time = (get_time() - opt_time) / benchmark_iterations;
@@ -111,22 +112,56 @@ static int UVTestFilter(int src_width,
return max_diff;
}
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = UVTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
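For instance, with a hypothetical benchmark width of 1280 and the 3/4 factor (nom = 3, denom = 4), the macros land on a pair of widths whose ratio is exactly 3:4:

// SX(1280, 3, 4) = (1280 / 3) * 4 = 426 * 4 = 1704   // source width
// DX(1280, 3, 4) = (1280 / 3) * 3 = 426 * 3 = 1278   // destination width
// 1278 : 1704 reduces to 3 : 4, so the factor is achieved with no rounding.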
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \
int diff = UVTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
/// Test scale to a specified size with all 3 filters.
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, 3)
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
// TEST_FACTOR(8, 1, 8) Disable for benchmark performance.
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = UVTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
/// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \