mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
2x down sample for UV planes ported to SSSE3 / NEON
Bug: libyuv:838
Change-Id: Id9fb3282a3e86143d76b5e0cb557f0523a88b3c8
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2465578
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
b6f3cff282
commit
d730dc2f18
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1765
+Version: 1766
 License: BSD
 License File: LICENSE
@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
                int width,
                int height);
 
+// Aliases
+#define ABGRToRGB24 ARGBToRAW
+#define ABGRToRAW ARGBToRGB24
+
 // Convert ARGB To RGB24.
 LIBYUV_API
 int ARGBToRGB24(const uint8_t* src_argb,
@@ -72,6 +72,13 @@ extern "C" {
 #define HAS_SCALEROWDOWN4_SSSE3
 #endif
 
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_SCALEUVROWDOWN2BOX_SSSE3
+#endif
+
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
 // The code supports NaCL but requires a new compiler and validator.
@@ -98,6 +105,11 @@ extern "C" {
 #define HAS_SCALEROWDOWN4_NEON
 #endif
 
+// The following are available on 64 bit Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SCALEUVROWDOWN2BOX_NEON
+#endif
+
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_SCALEADDROW_MSA
 #define HAS_SCALEARGBCOLS_MSA
@@ -830,15 +842,15 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
                                      int dst_width);
 
 // UV Row functions
-void ScaleUVRowDown2_SSE2(const uint8_t* src_uv,
+void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_uv,
                           int dst_width);
-void ScaleUVRowDown2Linear_SSE2(const uint8_t* src_uv,
+void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_uv,
                                 int dst_width);
-void ScaleUVRowDown2Box_SSE2(const uint8_t* src_uv,
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_uv,
                              int dst_width);
@@ -846,7 +858,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width);
-void ScaleUVRowDown2Linear_NEON(const uint8_t* src_uv,
+void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_uv,
                                 int dst_width);
@@ -854,39 +866,39 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width);
-void ScaleUVRowDown2_MSA(const uint8_t* src_uv,
+void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_uv,
                          int dst_width);
-void ScaleUVRowDown2Linear_MSA(const uint8_t* src_uv,
+void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_uv,
                                int dst_width);
-void ScaleUVRowDown2Box_MSA(const uint8_t* src_uv,
+void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst_uv,
                             int dst_width);
-void ScaleUVRowDown2_MMI(const uint8_t* src_uv,
+void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_uv,
                          int dst_width);
-void ScaleUVRowDown2Linear_MMI(const uint8_t* src_uv,
+void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_uv,
                                int dst_width);
-void ScaleUVRowDown2Box_MMI(const uint8_t* src_uv,
+void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst_uv,
                             int dst_width);
-void ScaleUVRowDown2_Any_SSE2(const uint8_t* src_ptr,
+void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width);
-void ScaleUVRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
+void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8_t* dst_ptr,
                                     int dst_width);
-void ScaleUVRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
+void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_ptr,
                                  int dst_width);
@@ -926,52 +938,52 @@ void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 int dst_width);
-void ScaleUVRowDownEven_SSE2(const uint8_t* src_uv,
+void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              int src_stepx,
                              uint8_t* dst_uv,
                              int dst_width);
-void ScaleUVRowDownEvenBox_SSE2(const uint8_t* src_uv,
+void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_uv,
                                 int dst_width);
-void ScaleUVRowDownEven_NEON(const uint8_t* src_uv,
+void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              int src_stepx,
                              uint8_t* dst_uv,
                              int dst_width);
-void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_uv,
+void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_uv,
                                 int dst_width);
-void ScaleUVRowDownEven_MSA(const uint8_t* src_uv,
+void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             int32_t src_stepx,
                             uint8_t* dst_uv,
                             int dst_width);
-void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_uv,
+void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                int src_stepx,
                                uint8_t* dst_uv,
                                int dst_width);
-void ScaleUVRowDownEven_MMI(const uint8_t* src_uv,
+void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             int32_t src_stepx,
                             uint8_t* dst_uv,
                             int dst_width);
-void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_uv,
+void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                int src_stepx,
                                uint8_t* dst_uv,
                                int dst_width);
-void ScaleUVRowDownEven_Any_SSE2(const uint8_t* src_ptr,
+void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_ptr,
                                  int dst_width);
-void ScaleUVRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
+void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     int src_stepx,
                                     uint8_t* dst_ptr,
                                     int dst_width);
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1765
+#define LIBYUV_VERSION 1766
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -20,49 +20,6 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
-#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
-  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
-               int dx) {                                                       \
-    int r = dst_width & MASK;                                                  \
-    int n = dst_width & ~MASK;                                                 \
-    if (n > 0) {                                                               \
-      TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                   \
-    }                                                                          \
-    TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx);                     \
-  }
-
-#ifdef HAS_SCALEFILTERCOLS_NEON
-CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
-#endif
-#ifdef HAS_SCALEFILTERCOLS_MSA
-CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
-#endif
-#ifdef HAS_SCALEARGBCOLS_NEON
-CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MSA
-CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
-#endif
-#ifdef HAS_SCALEARGBCOLS_MMI
-CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON,
-     ScaleARGBFilterCols_NEON,
-     ScaleARGBFilterCols_C,
-     4,
-     3)
-#endif
-#ifdef HAS_SCALEARGBFILTERCOLS_MSA
-CANY(ScaleARGBFilterCols_Any_MSA,
-     ScaleARGBFilterCols_MSA,
-     ScaleARGBFilterCols_C,
-     4,
-     7)
-#endif
-#undef CANY
-
 // Fixed scale down.
 // Mask may be non-power of 2, so use MOD
 #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
@@ -113,6 +70,14 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3,
       1,
       15)
 #endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+SDANY(ScaleUVRowDown2Box_Any_SSSE3,
+      ScaleUVRowDown2Box_SSSE3,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      4)
+#endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
 SDANY(ScaleRowDown2Linear_Any_AVX2,
@@ -155,6 +120,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
       1,
       15)
 #endif
+#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
+SDANY(ScaleUVRowDown2Box_Any_NEON,
+      ScaleUVRowDown2Box_NEON,
+      ScaleUVRowDown2Box_C,
+      2,
+      2,
+      8)
+#endif
+
 #ifdef HAS_SCALEROWDOWN2_MSA
 SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
 SDANY(ScaleRowDown2Linear_Any_MSA,
@@ -577,6 +551,49 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
 
 #endif  // SASIMDONLY
 
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+               int dx) {                                                       \
+    int r = dst_width & MASK;                                                  \
+    int n = dst_width & ~MASK;                                                 \
+    if (n > 0) {                                                               \
+      TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                   \
+    }                                                                          \
+    TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx);                     \
+  }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
+#ifdef HAS_SCALEARGBCOLS_NEON
+CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
+#ifdef HAS_SCALEARGBCOLS_MMI
+CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_NEON
+CANY(ScaleARGBFilterCols_Any_NEON,
+     ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C,
+     4,
+     3)
+#endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+     ScaleARGBFilterCols_MSA,
+     ScaleARGBFilterCols_C,
+     4,
+     7)
+#endif
+#undef CANY
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
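Both CANY above and the SDANY instantiations earlier in this file rely on the same remainder-handling idea: run the SIMD kernel on the largest prefix the vector width allows, then let the C kernel finish the ragged tail. (SDANY's masks may be non-powers of two, such as the 4 used for the new SSSE3 UV box kernel, which is why its comment says to use MOD rather than a bitwise AND.) A simplified sketch of the pattern, with hypothetical row-function names and a power-of-two mask:

#include <stdint.h>

// Hypothetical kernels standing in for a SIMD row function and its C
// reference; only the wrapper below illustrates the real pattern.
void ScaleRow_SIMD(uint8_t* dst, const uint8_t* src, int n);
void ScaleRow_C(uint8_t* dst, const uint8_t* src, int n);

// Simplified Any-wrapper: MASK is the SIMD granularity minus 1
// (7 when the kernel consumes multiples of 8 columns).
void ScaleRow_Any(uint8_t* dst, const uint8_t* src, int width) {
  enum { MASK = 7 };
  int r = width & MASK;   // ragged tail for the C kernel
  int n = width & ~MASK;  // largest multiple of 8 for the SIMD kernel
  if (n > 0) {
    ScaleRow_SIMD(dst, src, n);
  }
  ScaleRow_C(dst + n, src + n, r);
}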
@@ -1063,11 +1063,9 @@ void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
   int x;
   for (x = 0; x < dst_width; ++x) {
     dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
-                 src_uv[src_stride + 2] + 2) >>
-                2;
+                 src_uv[src_stride + 2] + 2) >> 2;
     dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
-                 src_uv[src_stride + 3] + 2) >>
-                2;
+                 src_uv[src_stride + 3] + 2) >> 2;
     src_uv += 4;
     dst_uv += 2;
   }
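The +2 before each >> 2 in the C path is half the divisor, so the 2x2 average rounds to nearest instead of truncating; the SIMD ports below reproduce exactly this rounding. A tiny self-contained check of that arithmetic (function name hypothetical, not part of libyuv):

#include <assert.h>
#include <stdint.h>

// Rounded average of a 2x2 block, as in ScaleUVRowDown2Box_C:
// adding 2 (half of 4) before the shift rounds to nearest.
static uint8_t Avg2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}

int main(void) {
  assert(Avg2x2(1, 2, 3, 4) == 3);  // 10 / 4 = 2.5 rounds up to 3
  assert(Avg2x2(255, 255, 255, 255) == 255);
  return 0;
}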
@@ -1366,6 +1366,52 @@ int FixedDiv1_X86(int num, int div) {
   return num;
 }
 
+#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
+// Shuffle table for splitting UV into upper and lower part of register.
+static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
+                                      1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
+static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, 6u, 14u,
+                                      0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+
+void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  asm volatile(
+      "pcmpeqb     %%xmm4,%%xmm4                 \n"  // 01010101
+      "psrlw       $0xf,%%xmm4                   \n"
+      "packuswb    %%xmm4,%%xmm4                 \n"
+      "pxor        %%xmm5,%%xmm5                 \n"  // zero
+      "movdqa      %4,%%xmm1                     \n"  // split shuffler
+      "movdqa      %5,%%xmm3                     \n"  // merge shuffler
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm0                   \n"  // 8 UV row 0
+      "movdqu      0x00(%0,%3,1),%%xmm2          \n"  // 8 UV row 1
+      "lea         0x10(%0),%0                   \n"
+      "pshufb      %%xmm1,%%xmm0                 \n"  // uuuuvvvv
+      "pshufb      %%xmm1,%%xmm2                 \n"
+      "pmaddubsw   %%xmm4,%%xmm0                 \n"  // horizontal add
+      "pmaddubsw   %%xmm4,%%xmm2                 \n"
+      "paddw       %%xmm2,%%xmm0                 \n"  // vertical add
+      "psrlw       $0x1,%%xmm0                   \n"  // round
+      "pavgw       %%xmm5,%%xmm0                 \n"
+      "pshufb      %%xmm3,%%xmm0                 \n"  // merge uv
+      "movq        %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"  // 4 UV
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kShuffleSplitUV),         // %4
+        "m"(kShuffleMergeUV)          // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_SCALEUVROWDOWN2BOX_SSSE3
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
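For readers who find the AT&T-syntax asm hard to follow: pshufb with kShuffleSplitUV deinterleaves each row into 8 U bytes followed by 8 V bytes, pmaddubsw against a register of 1s sums horizontal pairs into 16-bit lanes, paddw folds in the second row, and psrlw $1 followed by pavgw against zero computes (sum + 2) >> 2; kShuffleMergeUV then re-interleaves the 4 output UV pairs (the 0x80 entries zero the unused bytes). A rough intrinsics rendering of the same dataflow — not libyuv code, and it assumes dst_width is a multiple of 4:

#include <emmintrin.h>  // SSE2
#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8, _mm_maddubs_epi16
#include <stddef.h>
#include <stdint.h>

// Sketch of the kernel above, one iteration per 4 output UV pairs.
void ScaleUVRowDown2Box_SSSE3_sketch(const uint8_t* src, ptrdiff_t stride,
                                     uint8_t* dst, int dst_width) {
  const __m128i split = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                      1, 3, 5, 7, 9, 11, 13, 15);
  const __m128i merge = _mm_setr_epi8(0, 8, 2, 10, 4, 12, 6, 14,
                                      -128, -128, -128, -128,
                                      -128, -128, -128, -128);
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i zero = _mm_setzero_si128();
  for (int x = 0; x < dst_width; x += 4) {
    __m128i r0 = _mm_loadu_si128((const __m128i*)src);             // 8 UV row 0
    __m128i r1 = _mm_loadu_si128((const __m128i*)(src + stride));  // 8 UV row 1
    r0 = _mm_shuffle_epi8(r0, split);     // UUUUUUUUVVVVVVVV
    r1 = _mm_shuffle_epi8(r1, split);
    r0 = _mm_maddubs_epi16(r0, ones);     // horizontal pair sums
    r1 = _mm_maddubs_epi16(r1, ones);
    __m128i sum = _mm_add_epi16(r0, r1);  // 2x2 sums in 8 words
    sum = _mm_avg_epu16(_mm_srli_epi16(sum, 1), zero);  // (sum + 2) >> 2
    sum = _mm_shuffle_epi8(sum, merge);   // re-interleave 4 UV pairs
    _mm_storel_epi64((__m128i*)dst, sum);
    src += 16;
    dst += 8;
  }
}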
@@ -950,6 +950,35 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
 
 #undef LOAD2_DATA32_LANE
 
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "vld2.8      {d0, d2}, [%0]!               \n"  // load 8 UV pixels.
+      "vld2.8      {d1, d3}, [%0]!               \n"  // load next 8 UV
+      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
+      "vpaddl.u8   q0, q0                        \n"  // U 16 bytes -> 8 shorts.
+      "vpaddl.u8   q1, q1                        \n"  // V 16 bytes -> 8 shorts.
+      "vld2.8      {d16, d18}, [%1]!             \n"  // load 8 more UV
+      "vld2.8      {d17, d19}, [%1]!             \n"  // load last 8 UV
+      "vpadal.u8   q0, q8                        \n"  // U 16 bytes -> 8 shorts.
+      "vpadal.u8   q1, q9                        \n"  // V 16 bytes -> 8 shorts.
+      "vrshrn.u16  d0, q0, #2                    \n"  // round and pack to bytes
+      "vrshrn.u16  d1, q1, #2                    \n"
+      "vst2.8      {d0, d1}, [%2]!               \n"
+      "bgt         1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "q0", "q1", "q8", "q9");
+}
+
 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus
@@ -1086,6 +1086,35 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
   );
 }
 
+void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add         %1, %1, %0                    \n"
+      "1:                                        \n"
+      "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 UV
+      "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
+      "uaddlp      v0.8h, v0.16b                 \n"  // U 16 bytes -> 8 shorts.
+      "uaddlp      v1.8h, v1.16b                 \n"  // V 16 bytes -> 8 shorts.
+      "ld2         {v16.16b,v17.16b}, [%1], #32  \n"  // load 16
+      "uadalp      v0.8h, v16.16b                \n"  // U 16 bytes -> 8 shorts.
+      "uadalp      v1.8h, v17.16b                \n"  // V 16 bytes -> 8 shorts.
+      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
+      "rshrn       v0.8b, v0.8h, #2              \n"  // round and pack
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "rshrn       v1.8b, v1.8h, #2              \n"
+      "st2         {v0.8b,v1.8b}, [%2], #16      \n"
+      "b.gt        1b                            \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "v0", "v1", "v16", "v17");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
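The NEON ports get the U/V deinterleave for free from ld2/vld2, use uaddlp (pairwise add long) for the horizontal sums and uadalp (pairwise add and accumulate) to fold in the second row, and rshrn #2 does the rounded narrowing, matching the C path's (sum + 2) >> 2. A rough intrinsics rendering of the aarch64 version — not libyuv code, and it assumes dst_width is a multiple of 8:

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

// Sketch of the aarch64 kernel above, 8 output UV pairs per iteration.
void ScaleUVRowDown2Box_NEON_sketch(const uint8_t* src, ptrdiff_t stride,
                                    uint8_t* dst, int dst_width) {
  const uint8_t* src2 = src + stride;  // second source row
  for (int x = 0; x < dst_width; x += 8) {
    uint8x16x2_t uv0 = vld2q_u8(src);   // deinterleave 16 U and 16 V, row 0
    uint8x16x2_t uv1 = vld2q_u8(src2);  // same for row 1
    uint16x8_t u = vpaddlq_u8(uv0.val[0]);  // uaddlp: U pair sums
    uint16x8_t v = vpaddlq_u8(uv0.val[1]);  // V pair sums
    u = vpadalq_u8(u, uv1.val[0]);          // uadalp: accumulate row 1
    v = vpadalq_u8(v, uv1.val[1]);
    uint8x8x2_t out;
    out.val[0] = vrshrn_n_u16(u, 2);  // rshrn: (sum + 2) >> 2
    out.val[1] = vrshrn_n_u16(v, 2);
    vst2_u8(dst, out);                // re-interleave 8 UV pairs
    src += 32;
    src2 += 32;
    dst += 16;
  }
}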
@@ -73,22 +73,40 @@ static void ScaleUVDown2(int src_width,
     src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
   }
 
-#if defined(HAS_SCALEUVROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleUVRowDown2 =
-        filtering == kFilterNone
-            ? ScaleUVRowDown2_Any_SSE2
-            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSE2
-                                          : ScaleUVRowDown2Box_Any_SSE2);
-    if (IS_ALIGNED(dst_width, 2)) {
-      ScaleUVRowDown2 =
-          filtering == kFilterNone
-              ? ScaleUVRowDown2_SSE2
-              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSE2
-                                            : ScaleUVRowDown2Box_SSE2);
-    }
-  }
-#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
+    }
+  }
+#endif
+  // This code is not enabled. Only box filter is available at this time.
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleUVRowDown2 =
+        filtering == kFilterNone
+            ? ScaleUVRowDown2_Any_SSSE3
+            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
+                                          : ScaleUVRowDown2Box_Any_SSSE3);
+    if (IS_ALIGNED(dst_width, 2)) {
+      ScaleUVRowDown2 =
+          filtering == kFilterNone
+              ? ScaleUVRowDown2_SSSE3
+              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
+                                            : ScaleUVRowDown2Box_SSSE3);
+    }
+  }
+#endif
+#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && filtering) {
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
+    }
+  }
+#endif
+  // This code is not enabled. Only box filter is available at this time.
 #if defined(HAS_SCALEUVROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ScaleUVRowDown2 =
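The dispatch above follows libyuv's usual two-tier upgrade: when the CPU flag is present (and a filter was requested, since only the box kernel is ported), install the Any wrapper, which handles every width; then, if dst_width is a multiple of the kernel's step (4 UV pairs for SSSE3, 8 for NEON), install the exact kernel and skip the wrapper's tail handling. Condensed to a sketch using the same identifiers as this CL, the SSSE3 case reads:

  // Start from the C reference, then upgrade in two tiers.
  void (*row)(const uint8_t*, ptrdiff_t, uint8_t*, int) = ScaleUVRowDown2Box_C;
#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
    row = ScaleUVRowDown2Box_Any_SSSE3;  // handles any dst_width
    if (IS_ALIGNED(dst_width, 4)) {
      row = ScaleUVRowDown2Box_SSSE3;    // multiples of 4 UV pairs only
    }
  }
#endif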
@@ -180,11 +198,11 @@ static void ScaleUVDown4Box(int src_width,
   (void)dx;
   assert(dx == 65536 * 4);       // Test scale factor of 4.
   assert((dy & 0x3ffff) == 0);   // Test vertical scale is multiple of 4.
-#if defined(HAS_SCALEUVROWDOWN2_SSE2)
+#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSE2;
+    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleUVRowDown2 = ScaleUVRowDown2Box_SSE2;
+      ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
     }
   }
 #endif
@@ -237,13 +255,13 @@ static void ScaleUVDownEven(int src_width,
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
   src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
-#if defined(HAS_SCALEUVROWDOWNEVEN_SSE2)
+#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSE2
-                                   : ScaleUVRowDownEven_Any_SSE2;
+    ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
+                                   : ScaleUVRowDownEven_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
       ScaleUVRowDownEven =
-          filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSE2;
+          filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
     }
   }
 #endif
@@ -494,9 +512,9 @@ static void ScaleUVBilinearUp(int src_width,
     }
   }
 #endif
-#if defined(HAS_SCALEUVCOLS_SSE2)
+#if defined(HAS_SCALEUVCOLS_SSSE3)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
-    ScaleUVFilterCols = ScaleUVCols_SSE2;
+    ScaleUVFilterCols = ScaleUVCols_SSSE3;
   }
 #endif
 #if defined(HAS_SCALEUVCOLS_NEON)
@@ -525,9 +543,9 @@ static void ScaleUVBilinearUp(int src_width,
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleUVFilterCols = ScaleUVColsUp2_C;
-#if defined(HAS_SCALEUVCOLSUP2_SSE2)
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleUVFilterCols = ScaleUVColsUp2_SSE2;
+      ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
     }
 #endif
 #if defined(HAS_SCALEUVCOLSUP2_MMI)
@@ -612,9 +630,9 @@ static void ScaleUVSimple(int src_width,
                             int x, int dx) =
       (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
   (void)src_height;
-#if defined(HAS_SCALEUVCOLS_SSE2)
+#if defined(HAS_SCALEUVCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
-    ScaleUVCols = ScaleUVCols_SSE2;
+    ScaleUVCols = ScaleUVCols_SSSE3;
   }
 #endif
 #if defined(HAS_SCALEUVCOLS_NEON)
@@ -643,9 +661,9 @@ static void ScaleUVSimple(int src_width,
 #endif
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleUVCols = ScaleUVColsUp2_C;
-#if defined(HAS_SCALEUVCOLSUP2_SSE2)
+#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-      ScaleUVCols = ScaleUVColsUp2_SSE2;
+      ScaleUVCols = ScaleUVColsUp2_SSSE3;
     }
 #endif
 #if defined(HAS_SCALEUVCOLSUP2_MMI)
@@ -1114,6 +1114,8 @@ TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1)
 TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1)
 TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1)
 TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1)
+TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1)
+TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1)
 #ifdef LITTLE_ENDIAN_ONLY_TEST
 TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
 #endif
@@ -14,7 +14,6 @@
 #include "../unit_test/unit_test.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/scale_uv.h"
 #include "libyuv/video_common.h"
 
 namespace libyuv {
@@ -47,7 +46,8 @@ static int UVTestFilter(int src_width,
   }
   MemRandomize(src_uv, src_uv_plane_size);
 
-  int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL;
+  int64_t dst_uv_plane_size =
+      (dst_width + b * 2) * (dst_height + b * 2) * 2LL;
   int dst_stride_uv = (b * 2 + dst_width) * 2;
 
   align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
@@ -61,27 +61,28 @@ static int UVTestFilter(int src_width,
 
   // Warm up both versions for consistent benchmarks.
   MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
-  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-          src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
-          dst_width, dst_height, f);
+  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
+          src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2,
+          dst_stride_uv, dst_width, dst_height, f);
   MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
-  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-          src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
-          dst_width, dst_height, f);
+  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
+          src_width, src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2,
+          dst_stride_uv, dst_width, dst_height, f);
 
   MaskCpuFlags(disable_cpu_flags);  // Disable all CPU optimization.
   double c_time = get_time();
-  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-          src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
-          dst_width, dst_height, f);
+  UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
+          src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2,
+          dst_stride_uv, dst_width, dst_height, f);
 
   c_time = (get_time() - c_time);
 
   MaskCpuFlags(benchmark_cpu_info);  // Enable all CPU optimization.
   double opt_time = get_time();
   for (i = 0; i < benchmark_iterations; ++i) {
-    UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
-            src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
+    UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
+            src_width, src_height,
+            dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
             dst_width, dst_height, f);
   }
   opt_time = (get_time() - opt_time) / benchmark_iterations;
@@ -111,6 +112,40 @@ static int UVTestFilter(int src_width,
   return max_diff;
 }
 
+// The following adjustments in dimensions ensure the scale factor will be
+// exactly achieved.
+#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
+#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
+
+#define TEST_FACTOR1(name, filter, nom, denom, max_diff)                     \
+  TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) {                  \
+    int diff = UVTestFilter(                                                 \
+        SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
+        DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
+        kFilter##filter, benchmark_iterations_, disable_cpu_flags_,          \
+        benchmark_cpu_info_);                                                \
+    EXPECT_LE(diff, max_diff);                                               \
+  }
+
+// Test a scale factor with all 4 filters.  Expect unfiltered to be exact, but
+// filtering is different fixed point implementations for SSSE3, Neon and C.
+#define TEST_FACTOR(name, nom, denom)           \
+  TEST_FACTOR1(name, None, nom, denom, 0)       \
+  TEST_FACTOR1(name, Linear, nom, denom, 3)     \
+  TEST_FACTOR1(name, Bilinear, nom, denom, 3)   \
+  TEST_FACTOR1(name, Box, nom, denom, 3)
+
+TEST_FACTOR(2, 1, 2)
+TEST_FACTOR(4, 1, 4)
+// TEST_FACTOR(8, 1, 8) Disable for benchmark performance.
+TEST_FACTOR(3by4, 3, 4)
+TEST_FACTOR(3by8, 3, 8)
+TEST_FACTOR(3, 1, 3)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+#undef SX
+#undef DX
+
 #define TEST_SCALETO1(name, width, height, filter, max_diff) \
   TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
     int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \
@@ -126,7 +161,7 @@ static int UVTestFilter(int src_width,
     EXPECT_LE(diff, max_diff); \
   }
 
-// Test scale to a specified size with all 3 filters.
+// Test scale to a specified size with all 4 filters.
 #define TEST_SCALETO(name, width, height)       \
   TEST_SCALETO1(name, width, height, None, 0)   \
   TEST_SCALETO1(name, width, height, Linear, 3) \
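These macros ultimately drive the public UVScale() entry point declared in libyuv/scale_uv.h. A minimal usage sketch for the box-filtered 2x down-sample this CL accelerates — dimensions are illustrative, and the source buffer is left unfilled for brevity:

#include "libyuv/scale_uv.h"

#include <cstdint>
#include <vector>

// Halve an interleaved UV (e.g. NV12 chroma) plane with the box filter.
int HalveUVPlane() {
  const int src_width = 640, src_height = 360;  // in UV pairs, not bytes
  const int dst_width = 320, dst_height = 180;
  std::vector<uint8_t> src(src_width * 2 * src_height);  // 2 bytes per pair
  std::vector<uint8_t> dst(dst_width * 2 * dst_height);
  // Strides are in bytes; returns 0 on success.
  return libyuv::UVScale(src.data(), src_width * 2, src_width, src_height,
                         dst.data(), dst_width * 2, dst_width, dst_height,
                         libyuv::kFilterBox);
}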