From c41eabe3d4e1c30f8cb1c5f8660583bf168d426a Mon Sep 17 00:00:00 2001 From: Yuan Tong Date: Tue, 2 Mar 2021 23:41:07 +0800 Subject: [PATCH] Add full 16 bit scaling up by 2x function R=fbarchard@chromium.org Change-Id: I4a869aefdc16e34357a615727711594c5d8e3a80 Bug: libyuv:882 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2719842 Reviewed-by: Frank Barchard --- include/libyuv/scale_row.h | 64 ++++++- source/scale.cc | 131 ++++++++++++-- source/scale_any.cc | 72 ++++++-- source/scale_gcc.cc | 343 +++++++++++++++++++++++++++++++++---- source/scale_neon.cc | 102 ++++++++++- source/scale_neon64.cc | 104 ++++++++++- unit_test/scale_test.cc | 278 ++++++++++++++++++++++++++++++ 7 files changed, 1029 insertions(+), 65 deletions(-) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 18ffb546a..9ad51a562 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -81,8 +81,10 @@ extern "C" { #define HAS_SCALEROWUP2LINEAR_SSSE3 #define HAS_SCALEROWUP2BILINEAR_SSE2 #define HAS_SCALEROWUP2BILINEAR_SSSE3 -#define HAS_SCALEROWUP2LINEAR_16_SSSE3 -#define HAS_SCALEROWUP2BILINEAR_16_SSSE3 +#define HAS_SCALEROWUP2LINEAR_12_SSSE3 +#define HAS_SCALEROWUP2BILINEAR_12_SSSE3 +#define HAS_SCALEROWUP2LINEAR_16_SSE2 +#define HAS_SCALEROWUP2BILINEAR_16_SSE2 #define HAS_SCALEUVROWUP2LINEAR_SSSE3 #define HAS_SCALEUVROWUP2BILINEAR_SSSE3 #define HAS_SCALEUVROWUP2LINEAR_16_SSE2 @@ -98,6 +100,8 @@ extern "C" { #define HAS_SCALEUVROWDOWN2BOX_AVX2 #define HAS_SCALEROWUP2LINEAR_AVX2 #define HAS_SCALEROWUP2BILINEAR_AVX2 +#define HAS_SCALEROWUP2LINEAR_12_AVX2 +#define HAS_SCALEROWUP2BILINEAR_12_AVX2 #define HAS_SCALEROWUP2LINEAR_16_AVX2 #define HAS_SCALEROWUP2BILINEAR_16_AVX2 #define HAS_SCALEUVROWUP2LINEAR_AVX2 @@ -134,6 +138,8 @@ extern "C" { #define HAS_SCALEUVROWDOWNEVEN_NEON #define HAS_SCALEROWUP2LINEAR_NEON #define HAS_SCALEROWUP2BILINEAR_NEON +#define HAS_SCALEROWUP2LINEAR_12_NEON +#define HAS_SCALEROWUP2BILINEAR_12_NEON #define HAS_SCALEROWUP2LINEAR_16_NEON #define HAS_SCALEROWUP2BILINEAR_16_NEON #define HAS_SCALEUVROWUP2LINEAR_NEON @@ -611,10 +617,18 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); -void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -635,6 +649,14 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -651,7 +673,15 @@ void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_Any_SSSE3(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, + 
uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, @@ -675,6 +705,14 @@ void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1424,6 +1462,14 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); @@ -1440,6 +1486,14 @@ void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); diff --git a/source/scale.cc b/source/scale.cc index 4a5dc94aa..3ccd2111b 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1459,6 +1459,107 @@ void ScalePlaneUp2_Bilinear(int src_width, // its original width, using linear interpolation. // stride is in count of uint16_t. // This is used to scale U and V planes of I210 to I410 and I212 to I412. +void ScalePlaneUp2_12_Linear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr, + int dst_width) = ScaleRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2LINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width); + dst_ptr += dst_stride; + y += dy; + } + } +} + +// Scale at most 12 bit plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// stride is in count of uint16_t. 
+// This is used to scale U and V planes of I010 to I410 and I012 to I412. +void ScalePlaneUp2_12_Bilinear(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + void ScalePlaneUp2_16_Linear(int src_width, int src_height, int dst_width, @@ -1476,9 +1577,9 @@ void ScalePlaneUp2_16_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 - if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSSE3; +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; } #endif @@ -1508,11 +1609,6 @@ void ScalePlaneUp2_16_Linear(int src_width, } } -// Scale at most 12 bit plane, up by 2 times. -// This is an optimized version for scaling up a plane to 2 times of -// its original size, using bilinear interpolation. -// stride is in count of uint16_t. -// This is used to scale U and V planes of I010 to I410 and I012 to I412. void ScalePlaneUp2_16_Bilinear(int src_width, int src_height, int dst_width, @@ -1523,14 +1619,14 @@ void ScalePlaneUp2_16_Bilinear(int src_width, uint16_t* dst_ptr) { void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = - ScaleRowUp2_Bilinear_16_Any_C; + ScaleRowUp2_Bilinear_16_Any_C; int x; // This function can only scale up by 2 times. 
assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3; } @@ -1945,6 +2041,17 @@ void ScalePlane_16(const uint16_t* src, dst_stride, src, dst); return; } + if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { + ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } if (filtering && dst_height > src_height) { ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1981,13 +2088,13 @@ void ScalePlane_12(const uint16_t* src, } if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { - ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height, + ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { - ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height, + ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } diff --git a/source/scale_any.cc b/source/scale_any.cc index d30f58336..7a7af2480 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -656,14 +656,22 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSSE3, - ScaleRowUp2_Linear_16_SSSE3, +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, + ScaleRowUp2_Linear_12_SSSE3, ScaleRowUp2_Linear_16_C, 15, uint16_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, + ScaleRowUp2_Linear_16_SSE2, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_AVX2 SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, ScaleRowUp2_Linear_AVX2, @@ -672,11 +680,19 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 +SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, + ScaleRowUp2_Linear_12_AVX2, + ScaleRowUp2_Linear_16_C, + 31, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, ScaleRowUp2_Linear_16_AVX2, ScaleRowUp2_Linear_16_C, - 31, + 15, uint16_t) #endif @@ -688,11 +704,19 @@ SUH2LANY(ScaleRowUp2_Linear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEROWUP2LINEAR_12_NEON +SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, + ScaleRowUp2_Linear_12_NEON, + ScaleRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2LINEAR_16_NEON SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, ScaleRowUp2_Linear_16_NEON, ScaleRowUp2_Linear_16_C, - 15, + 7, uint16_t) #endif @@ -744,14 +768,22 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, - ScaleRowUp2_Bilinear_16_SSSE3, +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, + ScaleRowUp2_Bilinear_12_SSSE3, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) #endif +#ifdef 
HAS_SCALEROWUP2BILINEAR_16_SSE2 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, + ScaleRowUp2_Bilinear_16_SSE2, + ScaleRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, ScaleRowUp2_Bilinear_SSSE3, @@ -768,6 +800,14 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, + ScaleRowUp2_Bilinear_12_AVX2, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, ScaleRowUp2_Bilinear_16_AVX2, @@ -784,11 +824,19 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON +SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, + ScaleRowUp2_Bilinear_12_NEON, + ScaleRowUp2_Bilinear_16_C, + 15, + uint16_t) +#endif + #ifdef HAS_SCALEROWUP2BILINEAR_16_NEON SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, ScaleRowUp2_Bilinear_16_NEON, ScaleRowUp2_Bilinear_16_C, - 15, + 7, uint16_t) #endif @@ -860,7 +908,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, ScaleUVRowUp2_Linear_NEON, ScaleUVRowUp2_Linear_C, - 7, + 15, uint8_t) #endif @@ -868,7 +916,7 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, ScaleUVRowUp2_Linear_16_NEON, ScaleUVRowUp2_Linear_16_C, - 7, + 15, uint16_t) #endif @@ -966,7 +1014,7 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, ScaleUVRowUp2_Bilinear_16_NEON, ScaleUVRowUp2_Bilinear_16_C, - 3, + 7, uint16_t) #endif diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index f03903f0b..b1d39cf89 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -950,8 +950,8 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 -void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 +void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( @@ -1000,8 +1000,8 @@ void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 -void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 +void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -1045,11 +1045,11 @@ void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, "paddw %%xmm3,%%xmm5 \n" // near+far "paddw %%xmm1,%%xmm1 \n" // 2*near "paddw %%xmm3,%%xmm3 \n" // 2*near - "paddw %%xmm4,%%xmm1 \n" // 3*near+far (1, lo) - "paddw %%xmm5,%%xmm3 \n" // 3*near+far (1, hi) + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) - // xmm4 xmm1 xmm0 xmm2 - // xmm5 xmm2 xmm1 xmm3 + // xmm0 xmm2 + // xmm1 xmm3 "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm1,%%xmm5 \n" @@ -1099,6 +1099,166 @@ void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, } #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) + + "movdqa %%xmm0,%%xmm2 \n" + 
"movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packssdw %%xmm1,%%xmm0 \n" + "pshufd $0b11011000,%%xmm0,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 +void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0),%%xmm0 \n" // 0123 (16b) + "movq 2(%0),%%xmm1 \n" // 1234 (16b) + "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) + "paddd %%xmm0,%%xmm2 \n" // near+far (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 2(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) + "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) + "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) + "paddd %%xmm2,%%xmm4 \n" // near+far (lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far 
(2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packssdw %%xmm0,%%xmm4 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packssdw %%xmm2,%%xmm5 \n" + "pshufd $0b11011000,%%xmm4,%%xmm4 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_SCALEROWUP2LINEAR_SSSE3 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -1352,8 +1512,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 +void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( @@ -1402,8 +1562,8 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 -void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, +#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 +void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -1466,6 +1626,139 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, } #endif +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" 
// 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 +void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) + "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) + "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) + "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + 
"r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + // Reads 16xN bytes and produces 16 shorts at a time. void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, @@ -2522,7 +2815,6 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( - "vpxor %%xmm5,%%xmm5,%%xmm5 \n" "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2532,11 +2824,8 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 - - "vpunpcklwd %%ymm5,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) @@ -2564,7 +2853,7 @@ void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif @@ -2575,7 +2864,6 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( - "vpxor %%xmm7,%%xmm7,%%xmm7 \n" "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -2585,10 +2873,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 - "vpunpcklwd %%ymm7,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm7,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) @@ -2600,10 +2886,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0011000022330000 - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1122000033440000 - "vpunpcklwd %%ymm7,%%ymm2,%%ymm2 \n" // 00112233 (32b, 1u1v) - "vpunpcklwd %%ymm7,%%ymm3,%%ymm3 \n" // 11223344 (32b, 1u1v) + "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) + "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) @@ -2652,8 +2936,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 
41dba3e8e..6a0d6e1b4 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -603,7 +603,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; @@ -633,7 +633,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, ); } -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -647,7 +647,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmov.u16 q15, #3 \n" "1: \n" - "add %5, %0, #2 \n" "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) @@ -655,7 +654,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) - "add %5, %1, #2 \n" "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) "vld1.16 {q3}, [%6]! \n" // 12345678 (16b) @@ -692,6 +690,102 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "vmov.u16 d31, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) + "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) + + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 4567 (32b) + "vmovl.u16 q4, d2 \n" // 1234 (32b) + "vmovl.u16 q5, d3 \n" // 5678 (32b) + + "vmlal.u16 q2, d2, d31 \n" + "vmlal.u16 q3, d3, d31 \n" + "vmlal.u16 q4, d0, d31 \n" + "vmlal.u16 q5, d1, d31 \n" + + "vrshrn.u32 d0, q4, #2 \n" + "vrshrn.u32 d1, q5, #2 \n" + "vrshrn.u32 d2, q2, #2 \n" + "vrshrn.u32 d3, q3, #2 \n" + + "vst2.16 {q0, q1}, [%1]! \n" // store + "subs %2, %2, #16 \n" // 8 sample -> 16 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "vmov.u16 d31, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.16 {d0}, [%0]! \n" // 0123 (16b) + "vld1.16 {d1}, [%5]! \n" // 1234 (16b) + "vmovl.u16 q2, d0 \n" // 0123 (32b) + "vmovl.u16 q3, d1 \n" // 1234 (32b) + "vmlal.u16 q2, d1, d31 \n" + "vmlal.u16 q3, d0, d31 \n" + + "vld1.16 {d0}, [%1]! \n" // 0123 (16b) + "vld1.16 {d1}, [%6]! \n" // 1234 (16b) + "vmovl.u16 q4, d0 \n" // 0123 (32b) + "vmovl.u16 q5, d1 \n" // 1234 (32b) + "vmlal.u16 q4, d1, d31 \n" + "vmlal.u16 q5, d0, d31 \n" + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" + "vmla.u32 q5, q3, q14 \n" + "vmla.u32 q2, q0, q14 \n" + "vmla.u32 q3, q1, q14 \n" + + "vrshrn.u32 d1, q4, #4 \n" + "vrshrn.u32 d0, q5, #4 \n" + "vrshrn.u32 d3, q2, #4 \n" + "vrshrn.u32 d2, q3, #4 \n" + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! 
\n" // store + "subs %4, %4, #8 \n" // 4 sample -> 8 sample + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d31" // Clobber List + ); +} + void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 22fedcb5a..cde4ee39b 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -630,7 +630,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } -void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; @@ -661,7 +661,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, ); } -void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -721,6 +721,106 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 1; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b) + "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) + "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) + + "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) + "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far + "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) + + "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store + "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List + ); +} + +void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 1; + const uint16_t* src_temp1 = src_ptr1 + 1; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 0123 (16b) + "ldr d1, [%2], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" // 0123 (16b) + "ldr d1, [%3], #8 \n" // 1234 (16b) + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) + "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) 
+ + "mov v0.4s, v4.4s \n" + "mov v1.4s, v5.4s \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far + "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far + + "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 + "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 + + "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index d24806a66..066bcfde6 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -259,6 +259,123 @@ static int I420TestFilter_12(int src_width, return max_diff; } +// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. +// 0 = exact. +static int I420TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int src_width_uv = (Abs(src_width) + 1) >> 1; + int src_height_uv = (Abs(src_height) + 1) >> 1; + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16_t* p_src_y_16 = reinterpret_cast(src_y_16); + uint16_t* p_src_u_16 = reinterpret_cast(src_u_16); + uint16_t* p_src_v_16 = reinterpret_cast(src_v_16); + + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; + } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; + } + + int dst_width_uv = (dst_width + 1) >> 1; + int dst_height_uv = (dst_height + 1) >> 1; + + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); + + uint16_t* p_dst_y_16 = reinterpret_cast(dst_y_16); + uint16_t* p_dst_u_16 = reinterpret_cast(dst_u_16); + uint16_t* p_dst_v_16 = reinterpret_cast(dst_v_16); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (i = 0; i < benchmark_iterations; ++i) { + I420Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); + } + + // Expect an exact match. + int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + free_aligned_buffer_page_end(src_v_16); + + return max_diff; +} + // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int I444TestFilter(int src_width, int src_height, @@ -494,6 +611,123 @@ static int I444TestFilter_12(int src_width, return max_diff; } +// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. +// 0 = exact. 
+static int I444TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int src_width_uv = Abs(src_width); + int src_height_uv = Abs(src_height); + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); + uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16); + uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16); + + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; + } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; + } + + int dst_width_uv = dst_width; + int dst_height_uv = dst_height; + + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); + + uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); + uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16); + uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (i = 0; i < benchmark_iterations; ++i) { + I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); + } + + // Expect an exact match.
+ int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + free_aligned_buffer_page_end(src_v_16); + + return max_diff; +} + // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int NV12TestFilter(int src_width, int src_height, @@ -700,6 +934,20 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -736,6 +984,22 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \ int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ @@ -801,6 +1065,20 @@ TEST_SCALETO(Scale, 1920, 1080) disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \ + int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \ + int diff = I444TestFilter_16(benchmark_width_, 
benchmark_height_, \ + benchmark_height_, benchmark_width_, \ + kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \ int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \ benchmark_height_, benchmark_width_, \
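
Note on the kernels added by this patch: every new row function implements the same two filters that the in-line comments describe. The horizontal 2x pass computes each output pixel as 3*near + 1*far with round-to-nearest (the "3*near+far+2 ... div by 4" steps), and the 2x2 bilinear pass applies the 9:3:3:1 taps with +8 rounding and a shift by 4. The existing _12 SIMD paths keep those sums in 16-bit lanes, which is safe only for samples of at most 12 bits (9*4095 + 3*4095 + 3*4095 + 4095 + 8 = 65528 still fits in a uint16_t); the new full 16-bit paths first widen to 32-bit lanes (punpcklwd with zero, vpmovzxwd, ushll/umlal) before accumulating, which is the point of this change. The scalar sketch below restates those kernels for reference only; the function names are placeholders, not part of the patch, and the portable fallbacks actually used by the SUH2LANY/SU2BLANY wrappers above are ScaleRowUp2_Linear_16_C and ScaleRowUp2_Bilinear_16_C. Edge columns/rows are handled by the *_Any_* wrappers and the plane-level loops, so the sketch assumes one extra readable source sample to the right and one extra source row below.

  // Illustrative scalar versions of the 2x up-scaling kernels (hypothetical
  // "_Ref" names; not the functions shipped in the patch).
  #include <stddef.h>
  #include <stdint.h>

  // Horizontal 2x: out_even = (3*near + far + 2) >> 2, out_odd mirrors it.
  static void ScaleRowUp2_Linear_16_Ref(const uint16_t* src,
                                        uint16_t* dst,
                                        int dst_width) {
    int x;
    for (x = 0; x < dst_width / 2; ++x) {
      dst[2 * x + 0] = (uint16_t)((3 * src[x + 0] + src[x + 1] + 2) >> 2);
      dst[2 * x + 1] = (uint16_t)((src[x + 0] + 3 * src[x + 1] + 2) >> 2);
    }
  }

  // 2x2 bilinear: 9:3:3:1 taps with +8 rounding, i.e. (9n + 3h + 3v + d + 8) >> 4,
  // producing two output rows from one pair of input rows.
  static void ScaleRowUp2_Bilinear_16_Ref(const uint16_t* src,
                                          ptrdiff_t src_stride,
                                          uint16_t* dst,
                                          ptrdiff_t dst_stride,
                                          int dst_width) {
    const uint16_t* s = src;               // near input row
    const uint16_t* t = src + src_stride;  // far input row
    uint16_t* d = dst;                     // upper output row ("store above")
    uint16_t* e = dst + dst_stride;        // lower output row ("store below")
    int x;
    for (x = 0; x < dst_width / 2; ++x) {
      uint32_t s0 = s[x + 0], s1 = s[x + 1];
      uint32_t t0 = t[x + 0], t1 = t[x + 1];
      d[2 * x + 0] = (uint16_t)((9 * s0 + 3 * s1 + 3 * t0 + t1 + 8) >> 4);
      d[2 * x + 1] = (uint16_t)((3 * s0 + 9 * s1 + t0 + 3 * t1 + 8) >> 4);
      e[2 * x + 0] = (uint16_t)((3 * s0 + s1 + 9 * t0 + 3 * t1 + 8) >> 4);
      e[2 * x + 1] = (uint16_t)((s0 + 3 * s1 + 3 * t0 + 9 * t1 + 8) >> 4);
    }
  }

Because these weights are the same ones the 8-bit path uses, the new DISABLED_*_16 tests above can demand a bit-exact match: 8-bit frames pushed through I420Scale_16/I444Scale_16 must produce the same values as the plain 8-bit I420Scale/I444Scale.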