From d4ecb70610325fdaaeec6af074d6e3ceab9866d1 Mon Sep 17 00:00:00 2001 From: Yuan Tong Date: Fri, 12 Feb 2021 10:49:25 +0800 Subject: [PATCH] Add P010ToP410 and P210ToP410 These are 16 bit bi-planar convert functions to scale UV plane to Y plane's size using (bi)linear filter. libyuv_unittest --gtest_filter=*ToP41* R=fbarchard@chromium.org Bug: libyuv:872 Change-Id: I3cb4fafe2b2c9eedd0d91cf4c619abb9ee107bc1 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2690102 Reviewed-by: Frank Barchard --- README.chromium | 2 +- include/libyuv/convert.h | 38 ++ include/libyuv/scale_row.h | 82 +++- include/libyuv/scale_uv.h | 13 + include/libyuv/version.h | 2 +- source/convert.cc | 49 +++ source/row_gcc.cc | 65 +-- source/scale.cc | 54 ++- source/scale_any.cc | 74 +++- source/scale_common.cc | 58 +++ source/scale_gcc.cc | 814 ++++++++++++++++++++++--------------- source/scale_neon.cc | 96 +++++ source/scale_neon64.cc | 104 ++++- source/scale_uv.cc | 210 +++++++++- unit_test/convert_test.cc | 190 +++++---- 15 files changed, 1353 insertions(+), 498 deletions(-) diff --git a/README.chromium b/README.chromium index b96e82397..25445f1a2 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1776 +Version: 1777 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 7322300da..4e58ad6e0 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -315,6 +315,44 @@ int NV16ToNV24(const uint8_t* src_y, int width, int height); +// Convert P010 to P410. +LIBYUV_API +int P010ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert P012 to P412. +#define P012ToP412 P010ToP410 + +// Convert P016 to P416. +#define P016ToP416 P010ToP410 + +// Convert P210 to P410. +LIBYUV_API +int P210ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert P212 to P412. +#define P212ToP412 P210ToP410 + +// Convert P216 to P416. +#define P216ToP416 P210ToP410 + // Convert YUY2 to I420. 
LIBYUV_API int YUY2ToI420(const uint8_t* src_yuy2, diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 92759b2b5..e972b5337 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -81,10 +81,12 @@ extern "C" { #define HAS_SCALEROWUP2LINEAR_SSSE3 #define HAS_SCALEROWUP2BILINEAR_SSE2 #define HAS_SCALEROWUP2BILINEAR_SSSE3 -#define HAS_SCALEROWUP2LINEAR_16_SSE2 -#define HAS_SCALEROWUP2BILINEAR_16_SSE2 +#define HAS_SCALEROWUP2LINEAR_16_SSSE3 +#define HAS_SCALEROWUP2BILINEAR_16_SSSE3 #define HAS_SCALEUVROWUP2LINEAR_SSSE3 #define HAS_SCALEUVROWUP2BILINEAR_SSSE3 +#define HAS_SCALEUVROWUP2LINEAR_16_SSE2 +#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2 #endif // The following are available for gcc/clang x86 platforms, but @@ -100,6 +102,8 @@ extern "C" { #define HAS_SCALEROWUP2BILINEAR_16_AVX2 #define HAS_SCALEUVROWUP2LINEAR_AVX2 #define HAS_SCALEUVROWUP2BILINEAR_AVX2 +#define HAS_SCALEUVROWUP2LINEAR_16_AVX2 +#define HAS_SCALEUVROWUP2BILINEAR_16_AVX2 #endif // The following are available on all x86 platforms, but @@ -134,6 +138,8 @@ extern "C" { #define HAS_SCALEROWUP2BILINEAR_16_NEON #define HAS_SCALEUVROWUP2LINEAR_NEON #define HAS_SCALEUVROWUP2BILINEAR_NEON +#define HAS_SCALEUVROWUP2LINEAR_16_NEON +#define HAS_SCALEUVROWUP2BILINEAR_16_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -487,6 +493,22 @@ void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); void ScaleUVCols_C(uint8_t* dst_uv, const uint8_t* src_uv, @@ -589,10 +611,10 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -629,10 +651,10 @@ void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); -void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, +void ScaleRowUp2_Linear_16_Any_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); -void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, +void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, @@ -1235,6 +1257,54 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); +void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, + 
ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. diff --git a/include/libyuv/scale_uv.h b/include/libyuv/scale_uv.h index 1b6327aae..7b212178a 100644 --- a/include/libyuv/scale_uv.h +++ b/include/libyuv/scale_uv.h @@ -30,6 +30,19 @@ int UVScale(const uint8_t* src_uv, int dst_height, enum FilterMode filtering); +// Scale an 16 bit UV image. +// This function is currently incomplete, it can't handle all cases. +LIBYUV_API +int UVScale_16(const uint16_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint16_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 6073df8f5..911f038c8 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1776 +#define LIBYUV_VERSION 1777 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 06d312f96..8bf02b76a 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -663,6 +663,55 @@ int NV16ToNV24(const uint8_t* src_y, return 0; } +LIBYUV_API +int P010ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +LIBYUV_API +int P210ToP410(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + // Convert YUY2 to I420. 
LIBYUV_API int YUY2ToI420(const uint8_t* src_yuy2, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 1aea6db9e..c7e3fb959 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -4190,6 +4190,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, "lea 64(%4),%4 \n" "sub $0x10,%5 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -4231,6 +4232,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, "lea 64(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -4340,9 +4342,9 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, } #endif +static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; #ifdef HAS_SPLITARGBROW_SSSE3 -static const uvec8 kShuffleMaskARGBSplit = {0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, - 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u}; void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -4351,6 +4353,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, int width) { asm volatile( + "movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -4360,8 +4363,8 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %6,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %6,%%xmm1 \n" // 048C159D26AE37BF (hi) + "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) + "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) "movdqa %%xmm0,%%xmm2 \n" "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) @@ -4385,7 +4388,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } void SplitXRGBRow_SSSE3(const uint8_t* src_argb, @@ -4395,13 +4398,15 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, int width) { asm volatile( + "movdqa %5,%%xmm3 \n" + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F - "pshufb %5,%%xmm0 \n" // 048C159D26AE37BF (lo) - "pshufb %5,%%xmm1 \n" // 048C159D26AE37BF (hi) + "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) + "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) "movdqa %%xmm0,%%xmm2 \n" "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) @@ -4421,16 +4426,12 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, "+r"(dst_b), // %3 "+r"(width) // %4 : "m"(kShuffleMaskARGBSplit) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } #endif #ifdef HAS_SPLITARGBROW_AVX2 -static const lvec8 kShuffleMaskARGBSplit_AVX2 = { - 0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u, - 0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u}; -static const ulvec32 kShuffleMaskARGBPermute_AVX2 = {0u, 4u, 1u, 5u, - 2u, 6u, 3u, 7u}; +static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -4442,7 +4443,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" - "vmovdqu %7,%%ymm3 \n" + "vmovdqa %7,%%ymm3 \n" + "vbroadcastf128 %6,%%ymm4 \n" LABELALIGN "1: \n" @@ -4451,8 +4453,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, "vmovdqu 16(%0),%%xmm1 \n" // 10-1F "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F 
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F - "vpshufb %6,%%ymm0,%%ymm0 \n" - "vpshufb %6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" "vpermd %%ymm0,%%ymm3,%%ymm0 \n" "vpermd %%ymm1,%%ymm3,%%ymm1 \n" "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA @@ -4465,6 +4467,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, "lea 16(%1),%1 \n" "subl $0x10,%5 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -4475,9 +4478,9 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, #else "+rm"(width) // %5 #endif - : "m"(kShuffleMaskARGBSplit_AVX2), // %6 - "m"(kShuffleMaskARGBPermute_AVX2) // %7 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "m"(kShuffleMaskARGBSplit), // %6 + "m"(kShuffleMaskARGBPermute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } void SplitXRGBRow_AVX2(const uint8_t* src_argb, @@ -4487,15 +4490,18 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, int width) { asm volatile( - "vmovdqu %6,%%ymm3 \n" LABELALIGN + "vmovdqa %6,%%ymm3 \n" + "vbroadcastf128 %5,%%ymm4 \n" + + LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00-0F "vmovdqu 16(%0),%%xmm1 \n" // 10-1F "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F - "vpshufb %5,%%ymm0,%%ymm0 \n" - "vpshufb %5,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" "vpermd %%ymm0,%%ymm3,%%ymm0 \n" "vpermd %%ymm1,%%ymm3,%%ymm1 \n" "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA @@ -4510,13 +4516,14 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, "lea 16(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "m"(kShuffleMaskARGBSplit_AVX2), // %5 - "m"(kShuffleMaskARGBPermute_AVX2) // %6 + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskARGBSplit), // %5 + "m"(kShuffleMaskARGBPermute) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } #endif diff --git a/source/scale.cc b/source/scale.cc index 226024cd8..84c78711a 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1441,20 +1441,16 @@ void ScalePlaneUp2_Bilinear(int src_width, } #endif - if (src_height == 1) { - Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width); - } else { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO: Test performance of writing one row of destination at a time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } } } @@ -1480,9 +1476,9 @@ void ScalePlaneUp2_16_Linear(int src_width, // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; +#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSSE3; } #endif @@ -1534,9 +1530,9 @@ void ScalePlaneUp2_16_Bilinear(int src_width, assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 - if (TestCpuFlag(kCpuHasSSE2)) { - Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3; } #endif @@ -1552,19 +1548,15 @@ void ScalePlaneUp2_16_Bilinear(int src_width, } #endif - if (src_height == 1) { - Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width); - } else { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - } } } diff --git a/source/scale_any.cc b/source/scale_any.cc index 47b283863..d30f58336 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -656,9 +656,9 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 -SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, - ScaleRowUp2_Linear_16_SSE2, +#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 +SUH2LANY(ScaleRowUp2_Linear_16_Any_SSSE3, + ScaleRowUp2_Linear_16_SSSE3, ScaleRowUp2_Linear_16_C, 15, uint16_t) @@ -676,7 +676,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, ScaleRowUp2_Linear_16_AVX2, ScaleRowUp2_Linear_16_C, - 15, + 31, uint16_t) #endif @@ -744,9 +744,9 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 -SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, - ScaleRowUp2_Bilinear_16_SSE2, +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 +SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, + ScaleRowUp2_Bilinear_16_SSSE3, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) @@ -818,6 +818,12 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, 0, uint8_t) +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, + ScaleUVRowUp2_Linear_16_C, + ScaleUVRowUp2_Linear_16_C, + 0, + uint16_t) + #ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, ScaleUVRowUp2_Linear_SSSE3, @@ -834,6 +840,22 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2, + ScaleUVRowUp2_Linear_16_SSE2, + ScaleUVRowUp2_Linear_16_C, + 3, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, + ScaleUVRowUp2_Linear_16_AVX2, + ScaleUVRowUp2_Linear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEUVROWUP2LINEAR_NEON SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, ScaleUVRowUp2_Linear_NEON, @@ -842,6 +864,14 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON +SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, + ScaleUVRowUp2_Linear_16_NEON, + ScaleUVRowUp2_Linear_16_C, + 7, + 
uint16_t) +#endif + #undef SBUH2LANY // Scale bi-planar plane up 2 times using bilinear filter. @@ -886,6 +916,12 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, 0, uint8_t) +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, + ScaleUVRowUp2_Bilinear_16_C, + ScaleUVRowUp2_Bilinear_16_C, + 0, + uint16_t) + #ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, ScaleUVRowUp2_Bilinear_SSSE3, @@ -902,6 +938,22 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, uint8_t) #endif +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2, + ScaleUVRowUp2_Bilinear_16_SSE2, + ScaleUVRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, + ScaleUVRowUp2_Bilinear_16_AVX2, + ScaleUVRowUp2_Bilinear_16_C, + 7, + uint16_t) +#endif + #ifdef HAS_SCALEUVROWUP2BILINEAR_NEON SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, ScaleUVRowUp2_Bilinear_NEON, @@ -910,6 +962,14 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, uint8_t) #endif +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON +SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, + ScaleUVRowUp2_Bilinear_16_NEON, + ScaleUVRowUp2_Bilinear_16_C, + 3, + uint16_t) +#endif + #undef SBU2BLANY #ifdef __cplusplus diff --git a/source/scale_common.cc b/source/scale_common.cc index f4f233973..da96d4286 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1258,6 +1258,64 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, } } +void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; + uint16_t* d = dst_ptr; + uint16_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> + 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> + 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> + 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 + + t[2 * x + 2] * 3 + 8) >> + 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> + 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> + 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> + 4; + } +} + // Scales a single row of pixels using point sampling. 
void ScaleUVCols_C(uint8_t* dst_uv, const uint8_t* src_uv, diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 9563e5bb6..d1fb7de1e 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -774,6 +774,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "xmm6", "xmm7"); } +static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, + 10, 11, 8, 9, 14, 15, 12, 13}; + +static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, + 3, 1, 1, 3, 3, 1, 1, 3}; + #ifdef HAS_SCALEROWUP2LINEAR_SSE2 void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -896,7 +902,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, "movdqa %%xmm4,%%xmm3 \n" "movdqa %%xmm5,%%xmm6 \n" - "psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) @@ -904,7 +910,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, "movdqa %%xmm1,%%xmm7 \n" "movdqa %%xmm2,%%xmm6 \n" - "psllw $1,%%xmm7 \n" // 6*near+2*far (1, hi) + "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) @@ -915,14 +921,14 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, "movdqa %%xmm5,%%xmm3 \n" "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) - "psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) "psrlw $4,%%xmm5 \n" // ^ div by 16 "movdqa %%xmm2,%%xmm3 \n" "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) - "psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi) + "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) "psrlw $4,%%xmm2 \n" // ^ div by 16 @@ -944,148 +950,140 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 -void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { +#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3 +void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 - "pcmpeqw %%xmm6,%%xmm6 \n" - "psrlw $15,%%xmm6 \n" - "psllw $1,%%xmm6 \n" // all 2 + "movdqa %3,%%xmm5 \n" + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 LABELALIGN "1: \n" - "movdqu (%0),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm2 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm4 \n" - "punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "paddw %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16) - "psllw $1,%%xmm5 \n" - "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) - "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) - "movdqu %%xmm5,(%1) \n" + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16) - "punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16) - "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "paddw %%xmm6,%%xmm1 \n" - "psllw $1,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) - "psrlw $2,%%xmm1 
\n" // 3/4*near+1/4*far (hi) - "movdqu %%xmm1,0x10(%1) \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) + + "paddw %%xmm4,%%xmm1 \n" // far+2 + "paddw %%xmm4,%%xmm3 \n" // far+2 + "paddw %%xmm0,%%xmm1 \n" // near+far+2 + "paddw %%xmm2,%%xmm3 \n" // near+far+2 + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) + + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,16(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif -#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 -void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3 +void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { asm volatile( - "pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 + "movdqa %5,%%xmm6 \n" LABELALIGN "1: \n" // above line - "movdqu (%0),%%xmm1 \n" // 01234567 (16) - "movdqu 2(%0),%%xmm2 \n" // 12345678 (16) - "movdqa %%xmm1,%%xmm4 \n" - "punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm4 \n" - "movdqa %%xmm1,%%xmm5 \n" - "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) - - "movdqa %%xmm1,%%xmm3 \n" - "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16) - "punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16) - "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm2,%%xmm1 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + "movdqu (%0),%%xmm0 \n" // 01234567 (16) + "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) + "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) + "paddw %%xmm0,%%xmm1 \n" // near+far + "paddw %%xmm2,%%xmm3 \n" // near+far + "paddw %%xmm0,%%xmm0 \n" // 2*near + "paddw %%xmm2,%%xmm2 \n" // 2*near + "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) + "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) // below line - "movdqu (%0,%3,2),%%xmm6 \n" // 01234567 (16) - "movdqu 2(%0,%3,2),%%xmm2 \n" // 12345678 (16) - "movdqa %%xmm6,%%xmm3 \n" - "punpcklwd %%xmm3,%%xmm3 \n" // 00112233 (16) - "movdqa %%xmm2,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16) - "paddw %%xmm5,%%xmm3 \n" - "movdqa %%xmm6,%%xmm5 \n" - "punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16) - "paddw %%xmm5,%%xmm5 \n" - "paddw %%xmm3,%%xmm5 \n" // 3*near+far (2, lo) - - "movdqa 
%%xmm6,%%xmm3 \n" - "punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16) - "punpckhwd %%xmm6,%%xmm6 \n" // 44556677 (16) - "punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16) - "paddw %%xmm6,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) - - // xmm4 xmm1 - // xmm5 xmm2 - - "movdqa %%xmm4,%%xmm3 \n" - "movdqa %%xmm5,%%xmm6 \n" - "psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo) - "paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, lo) - "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) - "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,(%1) \n" - + "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) + "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) "movdqa %%xmm1,%%xmm3 \n" - "movdqa %%xmm2,%%xmm6 \n" - "psllw $1,%%xmm3 \n" // 6*near+2*far (1, hi) - "paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, hi) - "paddw %%xmm1,%%xmm3 \n" // 9*near+3*far (1, hi) - "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, hi) + "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) + "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) + "movdqa %%xmm3,%%xmm5 \n" + "movdqa %%xmm1,%%xmm4 \n" + "pshufb %%xmm6,%%xmm5 \n" // 54657687 (far) + "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) + "paddw %%xmm1,%%xmm4 \n" // near+far + "paddw %%xmm3,%%xmm5 \n" // near+far + "paddw %%xmm1,%%xmm1 \n" // 2*near + "paddw %%xmm3,%%xmm3 \n" // 2*near + "paddw %%xmm4,%%xmm1 \n" // 3*near+far (1, lo) + "paddw %%xmm5,%%xmm3 \n" // 3*near+far (1, hi) + + // xmm4 xmm1 xmm0 xmm2 + // xmm5 xmm2 xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,(%1) \n" + + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) + "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm4 \n" // ^ div by 16 + "movdqu %%xmm4,0x10(%1) \n" + + "movdqa %%xmm1,%%xmm4 \n" + "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) + "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm1 \n" // ^ div by 16 + "movdqu %%xmm1,(%1,%4,2) \n" + + "movdqa %%xmm3,%%xmm4 \n" + "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) + "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) "psrlw $4,%%xmm3 \n" // ^ div by 16 - "movdqu %%xmm3,0x10(%1) \n" - - "movdqa %%xmm5,%%xmm3 \n" - "paddw %%xmm7,%%xmm4 \n" // 3*near+far+8 (1, lo) - "psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo) - "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) - "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) - "psrlw $4,%%xmm5 \n" // ^ div by 16 - "movdqu %%xmm5,(%1,%4,2) \n" - - "movdqa %%xmm2,%%xmm3 \n" - "paddw %%xmm7,%%xmm1 \n" // 3*near+far+8 (1, hi) - "psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi) - "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) - "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) - "psrlw $4,%%xmm2 \n" // ^ div by 16 - "movdqu %%xmm2,0x10(%1,%4,2) \n" + "movdqu %%xmm3,0x10(%1,%4,2) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 sample to 16 sample @@ -1095,16 +1093,13 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* 
src_ptr, "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + "r"((intptr_t)(dst_stride)), // %4 + "m"(kLinearShuffleFar) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_SSSE3 -static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3, - 3, 1, 1, 3, 3, 1, 1, 3}; - void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -1112,7 +1107,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 - "movdqu %3,%%xmm3 \n" + "movdqa %3,%%xmm3 \n" LABELALIGN "1: \n" @@ -1136,10 +1131,10 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31_SSSE3) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif @@ -1154,7 +1149,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 - "movdqu %5,%%xmm7 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN "1: \n" @@ -1225,17 +1220,13 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31_SSSE3) // %5 + "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_AVX2 -static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, - 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, - 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; - void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -1243,7 +1234,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vmovdqu %3,%%ymm3 \n" + "vbroadcastf128 %3,%%ymm3 \n" LABELALIGN "1: \n" @@ -1268,10 +1259,11 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31_AVX2) // %3 + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearMadd31) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif @@ -1286,7 +1278,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vmovdqu %5,%%ymm7 \n" + "vbroadcastf128 %5,%%ymm7 \n" LABELALIGN "1: \n" @@ -1348,100 +1340,65 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31_AVX2) // %5 + "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 -static const lvec16 
kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, - 3, 1, 1, 3, 3, 1, 1, 3}; - void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( - "vmovdqu %3,%%ymm3 \n" + "vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) + "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) + "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 + + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 + "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 - "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 - "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 - "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 - "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo) - "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far + "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,32(%1) \n" - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" // 16 sample to 32 sample + "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31_16_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} - -// This version can handle full 16bit range but is slower -void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr, - uint16_t* dst_ptr, - int dst_width) { - asm volatile( - "vmovdqu %3,%%ymm3 \n" - "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $31,%%ymm4,%%ymm4 \n" - "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 - - LABELALIGN - "1: \n" - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) - - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 - "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 - "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 - "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 - "vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo) - "vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) - "vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) - "vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) - "vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) - "vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) - "vpackssdw 
%%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kLinearMadd31_16_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kLinearShuffleFar) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif @@ -1452,7 +1409,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( - "vmovdqu %5,%%ymm5 \n" + "vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 @@ -1464,25 +1421,21 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 - "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 - "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 - "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 - "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo) - "vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - "vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1) + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 - "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 - "vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878 - "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 - "vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo) - "vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi) - "vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2) + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) + "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) + "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far + "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near + "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) @@ -1502,94 +1455,15 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31_16_AVX2) // %5 + "m"(kLinearShuffleFar) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } - -// This version can handle full 16bit range but is slower. 
-void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr, - ptrdiff_t src_stride, - uint16_t* dst_ptr, - ptrdiff_t dst_stride, - int dst_width) { - asm volatile( - "vmovdqu %5,%%ymm7 \n" - "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" - "vpsrld $31,%%ymm6,%%ymm6 \n" - "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 - - LABELALIGN - "1: \n" - - "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) - "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767 - "vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878 - "vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878 - "vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656 - "vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo) - "vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) - - "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b) - "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b) - "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000 - "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000 - "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767 - "vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878 - "vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878 - "vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656 - "vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo) - "vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) - - "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) - "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) - "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) - "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) - "vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) - - "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) - "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) - "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) - "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) - "vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) - - "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) - "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) - "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) - "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) - "vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) - - "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) - "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) - "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) - "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) - "vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) - - "vpackssdw %%ymm0,%%ymm4,%%ymm4 \n" - "vmovdqu %%ymm4,(%1) \n" // store above - "vpackssdw %%ymm2,%%ymm5,%%ymm5 \n" - "vmovdqu %%ymm5,(%1,%4,2) \n" // store below - - "lea 0x10(%0),%0 \n" - "lea 0x20(%1),%1 \n" // 8 sample to 16 sample - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)), // %4 - "m"(kLinearMadd31_16_AVX2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} #endif // Reads 16xN bytes and produces 16 shorts at a time. 
@@ -2249,9 +2123,10 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, } #endif // HAS_SCALEUVROWDOWN2BOX_AVX2 +static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, + 1, 1, 3, 1, 3}; + #ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 -static const uvec8 kUVLinearMadd31_SSSE3 = {3, 1, 3, 1, 1, 3, 1, 3, - 3, 1, 3, 1, 1, 3, 1, 3}; void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -2259,7 +2134,7 @@ void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 - "movdqu %3,%%xmm3 \n" + "movdqa %3,%%xmm3 \n" LABELALIGN "1: \n" @@ -2275,17 +2150,17 @@ void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) - "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n" - "vmovdqu %%xmm0,(%1) \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31_SSSE3) // %3 + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif @@ -2300,7 +2175,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 - "movdqu %5,%%xmm7 \n" + "movdqa %5,%%xmm7 \n" LABELALIGN "1: \n" @@ -2369,16 +2244,13 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 - "m"(kUVLinearMadd31_SSSE3) // %5 + "m"(kUVLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 -static const lvec8 kUVLinearMadd31_AVX2 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, - 1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3, - 1, 3, 3, 1, 3, 1, 1, 3, 1, 3}; void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -2387,7 +2259,7 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 - "vmovdqu %3,%%ymm3 \n" + "vbroadcastf128 %3,%%ymm3 \n" LABELALIGN "1: \n" @@ -2411,10 +2283,11 @@ void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 uv to 16 uv "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kUVLinearMadd31_AVX2) // %3 + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif @@ -2429,7 +2302,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 - "vmovdqu %5,%%ymm7 \n" + "vbroadcastf128 %5,%%ymm7 \n" LABELALIGN "1: \n" @@ -2489,12 +2362,297 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" // 8 uv to 16 uv "sub $0x10,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 - "m"(kUVLinearMadd31_AVX2) // %5 + "m"(kUVLinearMadd31) // %5 + : "memory", "cc", "xmm0", "xmm1", 
"xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 +void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqd %%xmm4,%%xmm4 \n" + "psrld $31,%%xmm4 \n" + "pslld $1,%%xmm4 \n" // all 2 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + + "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) + "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) + + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) + + "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) + "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) + "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) + "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) + + "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) + "packusdw %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 +void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pxor %%xmm7,%%xmm7 \n" + "pcmpeqd %%xmm6,%%xmm6 \n" + "psrld $31,%%xmm6 \n" + "pslld $3,%%xmm6 \n" // all 8 + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) + "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) + "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) + "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) + "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) + "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) + "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) + "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) + "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) + "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) + + "movq (%0,%3,2),%%xmm2 \n" + "movq 4(%0,%3,2),%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) + "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) + "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) + "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) + "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) + "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) + "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) + "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm2,%%xmm5 \n" + "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddd 
%%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm1,%%xmm0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) + "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm2 \n" + "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) + "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) + "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) + "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) + "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) + + "packusdw %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packusdw %%xmm2,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4,2) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 2 uv to 4 uv + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 +void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + asm volatile( + "vpxor %%xmm5,%%xmm5,%%xmm5 \n" + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $31,%%ymm4,%%ymm4 \n" + "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 + + "vpunpcklwd %%ymm5,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + + "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) + "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) + + "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 +void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpxor %%xmm7,%%xmm7,%%xmm7 \n" + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrld $31,%%ymm6,%%ymm6 \n" + "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 + + LABELALIGN + "1: \n" + + "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0011000022330000 + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1122000033440000 + "vpunpcklwd %%ymm7,%%ymm0,%%ymm0 \n" // 00112233 (32b, 1u1v) 
+ "vpunpcklwd %%ymm7,%%ymm1,%%ymm1 \n" // 11223344 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) + "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) + "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) + "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) + "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) + + "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) + "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0011000022330000 + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1122000033440000 + "vpunpcklwd %%ymm7,%%ymm2,%%ymm2 \n" // 00112233 (32b, 1u1v) + "vpunpcklwd %%ymm7,%%ymm3,%%ymm3 \n" // 11223344 (32b, 1u1v) + "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) + "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) + "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) + "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) + "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) + "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) + "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) + + "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4,2) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } diff --git a/source/scale_neon.cc b/source/scale_neon.cc index e65654d92..14d8fcd89 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -791,6 +791,102 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } +void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 2; + asm volatile( + "vmov.u16 d30, #3 \n" + + "1: \n" + "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) + "vld1.16 {q1}, [%3]! 
\n" // 11223344 (1u1v, 16) + + "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b) + "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b) + "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b) + "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd) + "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even) + "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd) + "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even) + + "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even) + "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.32 {d0, d1}, [%1]! \n" // store + "vst2.32 {d2, d3}, [%1]! \n" // store + "subs %2, %2, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d30" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 2; + const uint16_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "vmov.u16 d30, #3 \n" + "vmov.u32 q14, #3 \n" + + "1: \n" + "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) + "vld1.8 {d1}, [%5]! \n" // 1122 (1u1v) + "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b) + "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd) + "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even) + + "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v) + "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v) + "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b) + "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b) + "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd) + "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even) + + "vmovq q0, q4 \n" + "vmovq q1, q5 \n" + "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd) + "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even) + "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd) + "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even) + + "vrshrn.u32 d1, q4, #4 \n" // 1, odd + "vrshrn.u32 d0, q5, #4 \n" // 1, even + "vrshrn.u32 d3, q2, #4 \n" // 2, odd + "vrshrn.u32 d2, q3, #4 \n" // 2, even + + "vst2.32 {d0, d1}, [%2]! \n" // store + "vst2.32 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #4 \n" // 2 uv -> 4 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", + "d30" // Clobber List + ); +} + // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. 
void ScaleAddRow_NEON(const uint8_t* src_ptr, diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 03a798cd4..0ac4e2ea3 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -799,8 +799,8 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, "rshrn v4.8b, v4.8h, #4 \n" // 1, odd "rshrn v3.8b, v5.8h, #4 \n" // 1, even - "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 1 - "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 2 + "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2 + "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1 "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv "b.gt 1b \n" : "+r"(src_ptr), // %0 @@ -816,6 +816,106 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ); } +void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + int dst_width) { + const uint16_t* src_temp = src_ptr + 2; + asm volatile( + "movi v31.8h, #3 \n" + + "1: \n" + "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) + "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b) + "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b) + + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even) + "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd) + "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even) + + "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even) + "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store + "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store + "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint16_t* src_ptr1 = src_ptr + src_stride; + uint16_t* dst_ptr1 = dst_ptr + dst_stride; + const uint16_t* src_temp = src_ptr + 2; + const uint16_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "movi v31.4h, #3 \n" + "movi v30.4s, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" + "ldr d1, [%2], #8 \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) + "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) + "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) + "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) + "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) + + "mov v0.4s, v4.4s \n" + "mov v1.4s, v5.4s \n" + "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) + "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) + "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) + "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) + + "rshrn v1.4h, v2.4s, #4 \n" // 2, odd + "rshrn v0.4h, v3.4s, #4 \n" // 2, even + "rshrn v3.4h, v4.4s, #4 \n" // 1, odd + "rshrn v2.4h, v5.4s, #4 \n" // 1, even + + "st2 {v0.2s, v1.2s}, [%5], 
#16 \n" // store 2 + "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1 + "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. void ScaleAddRow_NEON(const uint8_t* src_ptr, diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 003ad2a17..7baeae6d7 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -741,23 +741,124 @@ void ScaleUVBilinearUp2(int src_width, } #endif - if (src_height == 1) { - Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width); - } else { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO: Test performance of writing one row of destination at a time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); - dst_ptr += dst_stride; - for (x = 0; x < src_height - 1; ++x) { - Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); - src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. - dst_ptr += 2 * dst_stride; - } - if (!(dst_height & 1)) { - Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + +// Scale 16 bit UV, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. +// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of P210 to P410. +void ScaleUVLinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_uv, + uint16_t* dst_uv) { + void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_16_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width); + dst_uv += dst_stride; + y += dy; } } } +// Scale 16 bit UV, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of P010 to P410. 
+void ScaleUVBilinearUp2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { + void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_16_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 + if (TestCpuFlag(kCpuHasSSE2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON; + } +#endif + + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO: Test performance of writing one row of destination at a time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } +} + // Scale UV to/from any dimensions, without interpolation. // Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and @@ -851,6 +952,26 @@ static int UVCopy(const uint8_t* src_UV, CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height); return 0; } + +static int UVCopy_16(const uint16_t* src_UV, + int src_stride_UV, + uint16_t* dst_UV, + int dst_stride_UV, + int width, + int height) { + if (!src_UV || !dst_UV || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_UV = src_UV + (height - 1) * src_stride_UV; + src_stride_UV = -src_stride_UV; + } + + CopyPlane_16(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height); + return 0; +} #endif // HAS_UVCOPY // Scale a UV plane (from NV12) @@ -953,7 +1074,7 @@ static void ScaleUV(const uint8_t* src, dst_stride, src, dst, x, y, dy, 4, filtering); return; } - if (filtering && src_height == dst_height) { + if (filtering && (dst_width + 1) / 2 == src_width) { ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst); return; @@ -1005,6 +1126,69 @@ int UVScale(const uint8_t* src_uv, return 0; } +// Scale an 16 bit UV image. +// This function is currently incomplete, it can't handle all cases. +LIBYUV_API +int UVScale_16(const uint16_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint16_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int dy = 0; + + if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { + return -1; + } + + // UV does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. 
+ if (src_height < 0) { + src_height = -src_height; + src_uv = src_uv + (src_height - 1) * src_stride_uv; + src_stride_uv = -src_stride_uv; + } + src_width = Abs(src_width); + +#ifdef HAS_UVCOPY + if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { + if (dst_height == 1) { + UVCopy_16(src_uv + ((src_height - 1) / 2) * src_stride_uv, src_stride_uv, + dst_uv, dst_stride_uv, dst_width, dst_height); + } else { + dy = src_height / dst_height; + UVCopy_16(src_uv + src_stride_uv * ((dy - 1) / 2), src_stride_uv * dy, + dst_uv, dst_stride_uv, dst_width, dst_height); + } + + return 0; + } +#endif + + if (filtering && (dst_width + 1) / 2 == src_width) { + ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, + src_stride_uv, dst_stride_uv, src_uv, dst_uv); + return 0; + } + + if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height, + src_stride_uv, dst_stride_uv, src_uv, dst_uv); + return 0; + } + + return -1; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 18b910e58..2922fc405 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -377,89 +377,119 @@ TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2) TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2) TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) -#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \ - OFF, DOY) \ - TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = benchmark_height_; \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ - align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ - OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \ - SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ - for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ - src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ - (fastrand() & 0xff); \ - } \ - } \ - memset(dst_y_c, 1, kWidth* kHeight); \ - memset(dst_uv_c, 2, \ - 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ - memset(dst_uv_opt, 102, \ - 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - MaskCpuFlags(disable_cpu_flags_); \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_uv + OFF, \ - 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \ - dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y + OFF, kWidth, src_uv + OFF, \ - 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? 
dst_y_opt : NULL, \ - kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \ - NEG kHeight); \ - } \ - if (DOY) { \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth; ++j) { \ - EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ - } \ - } \ - } \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ - EXPECT_EQ(dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ - dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ - } \ - } \ - free_aligned_buffer_page_end(dst_y_c); \ - free_aligned_buffer_page_end(dst_uv_c); \ - free_aligned_buffer_page_end(dst_y_opt); \ - free_aligned_buffer_page_end(dst_uv_opt); \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ +#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ + DOY, SRC_DEPTH) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ + static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ + static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ + "DST SRC_SUBSAMP_X unsupported"); \ + static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ + "DST SRC_SUBSAMP_Y unsupported"); \ + static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ + "DST DST_SUBSAMP_X unsupported"); \ + static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ + "DST DST_SUBSAMP_Y unsupported"); \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ + const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ + const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ + const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ + align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_uv, \ + 2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_c, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_uv_opt, \ + 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ + MemRandomize(src_uv + OFF, 2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ + SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \ + SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ + } \ + for (int i = 0; i < 2 * kSrcHalfWidth * kSrcHalfHeight; ++i) { \ + src_uv_p[i] = src_uv_p[i] & ((1 << SRC_DEPTH) - 1); \ + } \ + memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ + DOY ?
reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \ + reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ + DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \ + reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, \ + kWidth, NEG kHeight); \ + } \ + if (DOY) { \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ + } \ + } \ + } \ + for (int i = 0; i < kDstHalfHeight; ++i) { \ + for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ + EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ + dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ + } \ + } \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ } -#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ - 1) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \ - TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ - SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0) +#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ + DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1, \ + SRC_DEPTH) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, 1, \ + SRC_DEPTH) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \ + SRC_DEPTH) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ + SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ + DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \ + SRC_DEPTH) -TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2) -TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2) -TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1) -TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1) +TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8) +TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8) +TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8) +TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8) +// These formats put
data in the high bits, so test the full 16 bit range. +TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 16) +TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 16) +TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 16) +TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 16) +TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16) +TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16) #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \