diff --git a/README.chromium b/README.chromium index 4836a29c5..c0a3591e1 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1765 +Version: 1766 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index 158730ca8..96e1557ca 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb, int width, int height); +// Aliases +#define ABGRToRGB24 ARGBToRAW +#define ABGRToRAW ARGBToRGB24 + // Convert ARGB To RGB24. LIBYUV_API int ARGBToRGB24(const uint8_t* src_argb, diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 776dc383d..9ebc10024 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -72,6 +72,13 @@ extern "C" { #define HAS_SCALEROWDOWN4_SSSE3 #endif +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_SCALEUVROWDOWN2BOX_SSSE3 +#endif + // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. @@ -98,6 +105,11 @@ extern "C" { #define HAS_SCALEROWDOWN4_NEON #endif +// The following are available on 64 bit Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_SCALEUVROWDOWN2BOX_NEON +#endif + #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_SCALEADDROW_MSA #define HAS_SCALEARGBCOLS_MSA @@ -830,15 +842,15 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, int dst_width); // UV Row functions -void ScaleUVRowDown2_SSE2(const uint8_t* src_uv, +void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDown2Linear_SSE2(const uint8_t* src_uv, +void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDown2Box_SSE2(const uint8_t* src_uv, +void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); @@ -846,7 +858,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); -void ScaleUVRowDown2Linear_NEON(const uint8_t* src_uv, +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); @@ -854,42 +866,42 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); -void ScaleUVRowDown2_MSA(const uint8_t* src_uv, +void ScaleUVRowDown2_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDown2Linear_MSA(const uint8_t* src_uv, +void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDown2Box_MSA(const uint8_t* src_uv, +void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDown2_MMI(const uint8_t* src_uv, +void ScaleUVRowDown2_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDown2Linear_MMI(const uint8_t* src_uv, +void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void 
ScaleUVRowDown2Box_MMI(const uint8_t* src_uv, +void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDown2_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); -void ScaleUVRowDown2Box_Any_SSE2(const uint8_t* src_ptr, - ptrdiff_t src_stride, - uint8_t* dst_ptr, - int dst_width); +void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -926,52 +938,52 @@ void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); -void ScaleUVRowDownEven_SSE2(const uint8_t* src_uv, +void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEvenBox_SSE2(const uint8_t* src_uv, +void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEven_NEON(const uint8_t* src_uv, +void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_uv, +void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEven_MSA(const uint8_t* src_uv, +void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_uv, +void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEven_MMI(const uint8_t* src_uv, +void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_uv, +void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); -void ScaleUVRowDownEven_Any_SSE2(const uint8_t* src_ptr, +void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); -void ScaleUVRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, +void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 381293c0b..47fdf8204 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1765 +#define LIBYUV_VERSION 1766 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/scale_any.cc b/source/scale_any.cc index d780cb1ff..5fca6ffb9 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -20,49 +20,6 @@ namespace libyuv { extern "C" { #endif -// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols -#define CANY(NAMEANY, 
TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ - int dx) { \ - int r = dst_width & MASK; \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ - } - -#ifdef HAS_SCALEFILTERCOLS_NEON -CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) -#endif -#ifdef HAS_SCALEFILTERCOLS_MSA -CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) -#endif -#ifdef HAS_SCALEARGBCOLS_NEON -CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) -#endif -#ifdef HAS_SCALEARGBCOLS_MSA -CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) -#endif -#ifdef HAS_SCALEARGBCOLS_MMI -CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, - ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, - 4, - 3) -#endif -#ifdef HAS_SCALEARGBFILTERCOLS_MSA -CANY(ScaleARGBFilterCols_Any_MSA, - ScaleARGBFilterCols_MSA, - ScaleARGBFilterCols_C, - 4, - 7) -#endif -#undef CANY - // Fixed scale down. // Mask may be non-power of 2, so use MOD #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ @@ -113,6 +70,14 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +SDANY(ScaleUVRowDown2Box_Any_SSSE3, + ScaleUVRowDown2Box_SSSE3, + ScaleUVRowDown2Box_C, + 2, + 2, + 4) +#endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2Linear_Any_AVX2, @@ -155,6 +120,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON, 1, 15) #endif +#ifdef HAS_SCALEUVROWDOWN2BOX_NEON +SDANY(ScaleUVRowDown2Box_Any_NEON, + ScaleUVRowDown2Box_NEON, + ScaleUVRowDown2Box_C, + 2, + 2, + 8) +#endif + #ifdef HAS_SCALEROWDOWN2_MSA SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) SDANY(ScaleRowDown2Linear_Any_MSA, @@ -577,6 +551,49 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) #endif // SASIMDONLY +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBCOLS_MMI +CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) +#endif +#undef CANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/scale_common.cc b/source/scale_common.cc 
index fd4cbd038..de3b22f27 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1063,11 +1063,9 @@ void ScaleUVRowDown2Box_C(const uint8_t* src_uv, int x; for (x = 0; x < dst_width; ++x) { dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + - src_uv[src_stride + 2] + 2) >> - 2; + src_uv[src_stride + 2] + 2) >> 2; dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + - src_uv[src_stride + 3] + 2) >> - 2; + src_uv[src_stride + 3] + 2) >> 2; src_uv += 4; dst_uv += 2; } diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 90a49f30d..8806e1363 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -1366,6 +1366,52 @@ int FixedDiv1_X86(int num, int div) { return num; } +#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 +// Shuffle table for splitting UV into upper and lower part of register. +static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, + 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; +static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, 6u, 14u, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; + +void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5, %%xmm5 \n" // zero + "movdqa %4,%%xmm1 \n" // split shuffler + "movdqa %5,%%xmm3 \n" // merge shuffler + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // 8 UV row 0 + "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 + "lea 0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv + "pshufb %%xmm1,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add + "pmaddubsw %%xmm4,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" // vertical add + "psrlw $0x1,%%xmm0 \n" // round + "pavgw %%xmm5,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" // merge uv + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" // 4 UV + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kShuffleSplitUV), // %4 + "m"(kShuffleMergeUV) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 366b155ba..b626fc298 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -950,6 +950,35 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, #undef LOAD2_DATA32_LANE +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. + "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. + "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV + "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV + "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vst2.8 {d0, d1}, [%2]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q8", "q9"); +} + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 7c7c33e25..c45b7abec 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1086,6 +1086,35 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. + "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 + "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "prfm pldl1keep, [%1, 448] \n" + "rshrn v1.8b, v1.8h, #2 \n" + "st2 {v0.8b,v1.8b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v16", "v17"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 99742a6db..2b65dc4ee 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -73,22 +73,40 @@ static void ScaleUVDown2(int src_width, src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2; } -#if defined(HAS_SCALEUVROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_Any_SSE2 - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSE2 - : ScaleUVRowDown2Box_Any_SSE2); - if (IS_ALIGNED(dst_width, 2)) { - ScaleUVRowDown2 = - filtering == kFilterNone - ? ScaleUVRowDown2_SSE2 - : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSE2 - : ScaleUVRowDown2Box_SSE2); +#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; } } #endif +// This code is not enabled. Only box filter is available at this time. +#if defined(HAS_SCALEUVROWDOWN2_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3 + : ScaleUVRowDown2Box_Any_SSSE3); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3 + : ScaleUVRowDown2Box_SSSE3); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2BOX_NEON) + if (TestCpuFlag(kCpuHasNEON) && filtering) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif +// This code is not enabled. Only box filter is available at this time. #if defined(HAS_SCALEUVROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleUVRowDown2 = @@ -180,11 +198,11 @@ static void ScaleUVDown4Box(int src_width, (void)dx; assert(dx == 65536 * 4); // Test scale factor of 4. 
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. -#if defined(HAS_SCALEUVROWDOWN2_SSE2) +#if defined(HAS_SCALEUVROWDOWN2_SSSE3) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSE2; + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { - ScaleUVRowDown2 = ScaleUVRowDown2Box_SSE2; + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3; } } #endif @@ -237,13 +255,13 @@ static void ScaleUVDownEven(int src_width, assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_uv += (y >> 16) * src_stride + (x >> 16) * 2; -#if defined(HAS_SCALEUVROWDOWNEVEN_SSE2) +#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSE2 - : ScaleUVRowDownEven_Any_SSE2; + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 + : ScaleUVRowDownEven_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { ScaleUVRowDownEven = - filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSE2; + filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; } } #endif @@ -494,9 +512,9 @@ static void ScaleUVBilinearUp(int src_width, } } #endif -#if defined(HAS_SCALEUVCOLS_SSE2) +#if defined(HAS_SCALEUVCOLS_SSSE3) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleUVFilterCols = ScaleUVCols_SSE2; + ScaleUVFilterCols = ScaleUVCols_SSSE3; } #endif #if defined(HAS_SCALEUVCOLS_NEON) @@ -525,9 +543,9 @@ static void ScaleUVBilinearUp(int src_width, #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleUVFilterCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSE2) +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleUVFilterCols = ScaleUVColsUp2_SSE2; + ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; } #endif #if defined(HAS_SCALEUVCOLSUP2_MMI) @@ -612,9 +630,9 @@ static void ScaleUVSimple(int src_width, int x, int dx) = (src_width >= 32768) ? 
ScaleUVCols64_C : ScaleUVCols_C; (void)src_height; -#if defined(HAS_SCALEUVCOLS_SSE2) +#if defined(HAS_SCALEUVCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { - ScaleUVCols = ScaleUVCols_SSE2; + ScaleUVCols = ScaleUVCols_SSSE3; } #endif #if defined(HAS_SCALEUVCOLS_NEON) @@ -643,9 +661,9 @@ static void ScaleUVSimple(int src_width, #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleUVCols = ScaleUVColsUp2_C; -#if defined(HAS_SCALEUVCOLSUP2_SSE2) +#if defined(HAS_SCALEUVCOLSUP2_SSSE3) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleUVCols = ScaleUVColsUp2_SSE2; + ScaleUVCols = ScaleUVColsUp2_SSSE3; } #endif #if defined(HAS_SCALEUVCOLSUP2_MMI) diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 021abb6f8..00e3820cf 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1114,6 +1114,8 @@ TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1) TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1) TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1) TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1) +TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1) +TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1) #endif diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc index c8b57d174..132e4a602 100644 --- a/unit_test/scale_uv_test.cc +++ b/unit_test/scale_uv_test.cc @@ -14,7 +14,6 @@ #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/scale_uv.h" -#include "libyuv/video_common.h" namespace libyuv { @@ -23,13 +22,13 @@ namespace libyuv { // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int UVTestFilter(int src_width, - int src_height, - int dst_width, - int dst_height, - FilterMode f, - int benchmark_iterations, - int disable_cpu_flags, - int benchmark_cpu_info) { + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } @@ -47,7 +46,8 @@ static int UVTestFilter(int src_width, } MemRandomize(src_uv, src_uv_plane_size); - int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL; + int64_t dst_uv_plane_size = + (dst_width + b * 2) * (dst_height + b * 2) * 2LL; int dst_stride_uv = (b * 2 + dst_width) * 2; align_buffer_page_end(dst_uv_c, dst_uv_plane_size); @@ -61,28 +61,29 @@ static int UVTestFilter(int src_width, // Warm up both versions for consistent benchmarks. MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, + src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, + dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, + src_width, src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, + dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
double c_time = get_time(); - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, + src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, + dst_stride_uv, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { - UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, - src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, - dst_width, dst_height, f); + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, + src_width, src_height, + dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, + dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; @@ -111,22 +112,56 @@ static int UVTestFilter(int src_width, return max_diff; } -#define TEST_SCALETO1(name, width, height, filter, max_diff) \ - TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ - int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \ - height, kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ - int diff = UVTestFilter(width, height, Abs(benchmark_width_), \ - Abs(benchmark_height_), kFilter##filter, \ - benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ +// The following adjustments in dimensions ensure the scale factor will be +// exactly achieved. +#define DX(x, nom, denom) static_cast((Abs(x) / nom) * nom) +#define SX(x, nom, denom) static_cast((x / nom) * denom) + +#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \ + int diff = UVTestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ } -/// Test scale to a specified size with all 3 filters. +// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but +// filtering is different fixed point implementations for SSSE3, Neon and C. +#define TEST_FACTOR(name, nom, denom) \ + TEST_FACTOR1(name, None, nom, denom, 0) \ + TEST_FACTOR1(name, Linear, nom, denom, 3) \ + TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(name, Box, nom, denom, 3) + +TEST_FACTOR(2, 1, 2) +TEST_FACTOR(4, 1, 4) +// TEST_FACTOR(8, 1, 8) Disable for benchmark performance. 
+TEST_FACTOR(3by4, 3, 4) +TEST_FACTOR(3by8, 3, 8) +TEST_FACTOR(3, 1, 3) +#undef TEST_FACTOR1 +#undef TEST_FACTOR +#undef SX +#undef DX + +#define TEST_SCALETO1(name, width, height, filter, max_diff) \ + TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ + int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ + int diff = UVTestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } + +/// Test scale to a specified size with all 4 filters. #define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(name, width, height, None, 0) \ TEST_SCALETO1(name, width, height, Linear, 3) \
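
Note: the ScaleUVRowDown2Box_SSSE3 and ScaleUVRowDown2Box_NEON kernels added above are vectorized equivalents of the scalar ScaleUVRowDown2Box_C reference touched in source/scale_common.cc: each output UV pair is the rounded average of a 2x2 block of interleaved UV samples from two adjacent source rows. A minimal scalar sketch of that arithmetic follows; the helper name UVDown2BoxRow is hypothetical and not part of this change, but the per-pixel math mirrors the C reference shown in the diff.

  #include <stddef.h>
  #include <stdint.h>

  // Rounded 2x2 box average over interleaved UV pixels, mirroring
  // ScaleUVRowDown2Box_C. src_stride is the byte offset to the second
  // source row; dst_width counts output UV pairs (2 bytes each).
  static void UVDown2BoxRow(const uint8_t* src_uv,
                            ptrdiff_t src_stride,
                            uint8_t* dst_uv,
                            int dst_width) {
    int x;
    for (x = 0; x < dst_width; ++x) {
      dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
                   src_uv[src_stride + 2] + 2) >> 2;  // U: 4-tap average, rounded
      dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
                   src_uv[src_stride + 3] + 2) >> 2;  // V: 4-tap average, rounded
      src_uv += 4;  // consume two source UV pairs per row
      dst_uv += 2;  // produce one destination UV pair
    }
  }

The SSSE3 version reaches the same result by de-interleaving with pshufb, horizontally adding with pmaddubsw, summing the two rows, and rounding with psrlw/pavgw; the NEON versions use vpaddl/vpadal (uaddlp/uadalp on AArch64) followed by a rounding narrow shift.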