From e647902212ddce08d004bc34cf9720ac960b5d54 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 29 Sep 2020 15:49:57 -0700 Subject: [PATCH] NV12Scale function and ScaleUV for packed UV plane bilinear scaling Bug: libyuv:718, libyuv:838, b/168918847 Change-Id: I3300c1e7d51407b9c3201cf52b68e2e11346ff5f Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2427868 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- Android.bp | 2 + Android.mk | 2 + BUILD.gn | 3 + include/libyuv/scale_row.h | 225 ++++++++++ include/libyuv/scale_uv.h | 38 ++ linux.mk | 1 + source/scale.cc | 80 ++-- source/scale_argb.cc | 2 +- source/scale_common.cc | 231 ++++++++++ source/scale_uv.cc | 832 +++++++++++++++++++++++++++++++++++++ unit_test/scale_test.cc | 15 +- unit_test/scale_uv_test.cc | 146 +++++++ 12 files changed, 1518 insertions(+), 59 deletions(-) create mode 100644 include/libyuv/scale_uv.h create mode 100644 source/scale_uv.cc create mode 100644 unit_test/scale_uv_test.cc diff --git a/Android.bp b/Android.bp index 73b77a326..06eec1003 100644 --- a/Android.bp +++ b/Android.bp @@ -40,6 +40,7 @@ cc_library { "source/scale.cc", "source/scale_any.cc", "source/scale_argb.cc", + "source/scale_uv.cc", "source/scale_common.cc", "source/scale_gcc.cc", "source/scale_mmi.cc", @@ -91,6 +92,7 @@ cc_test { "unit_test/rotate_argb_test.cc", "unit_test/rotate_test.cc", "unit_test/scale_argb_test.cc", + "unit_test/scale_uv_test.cc", "unit_test/scale_test.cc", "unit_test/video_common_test.cc", ], diff --git a/Android.mk b/Android.mk index 08f990af6..47fbfb653 100644 --- a/Android.mk +++ b/Android.mk @@ -42,6 +42,7 @@ LOCAL_SRC_FILES := \ source/row_win.cc \ source/scale_any.cc \ source/scale_argb.cc \ + source/scale_uv.cc \ source/scale.cc \ source/scale_common.cc \ source/scale_gcc.cc \ @@ -101,6 +102,7 @@ LOCAL_SRC_FILES := \ unit_test/rotate_argb_test.cc \ unit_test/rotate_test.cc \ unit_test/scale_argb_test.cc \ + unit_test/scale_uv_test.cc \ unit_test/scale_test.cc \ unit_test/video_common_test.cc diff --git a/BUILD.gn b/BUILD.gn index 8cd334b40..28a2a1a0d 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -97,6 +97,7 @@ static_library("libyuv_internal") { "include/libyuv/row.h", "include/libyuv/scale.h", "include/libyuv/scale_argb.h", + "include/libyuv/scale_uv.h", "include/libyuv/scale_row.h", "include/libyuv/version.h", "include/libyuv/video_common.h", @@ -130,6 +131,7 @@ static_library("libyuv_internal") { "source/scale.cc", "source/scale_any.cc", "source/scale_argb.cc", + "source/scale_uv.cc", "source/scale_common.cc", "source/scale_gcc.cc", "source/scale_win.cc", @@ -278,6 +280,7 @@ if (libyuv_include_tests) { "unit_test/rotate_argb_test.cc", "unit_test/rotate_test.cc", "unit_test/scale_argb_test.cc", + "unit_test/scale_uv_test.cc", "unit_test/scale_test.cc", "unit_test/unit_test.cc", "unit_test/unit_test.h", diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index dd20718a8..776dc383d 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -376,6 +376,53 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb, int dst_width, int x32, int dx); +void ScaleUVRowDown2_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int 
src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx); +void ScaleUVCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx); +void ScaleUVColsUp2_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int, + int); +void ScaleUVFilterCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx); +void ScaleUVFilterCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx); // Specialized scalers for x86. void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, @@ -782,6 +829,184 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); +// UV Row functions +void ScaleUVRowDown2_SSE2(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_SSE2(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_SSE2(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleUVRowDown2Linear_NEON(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleUVRowDown2_MSA(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_MSA(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_MSA(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2_MMI(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Linear_MMI(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2Box_MMI(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDown2_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + 
uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_SSE2(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_SSE2(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_NEON(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_MSA(const uint8_t* src_uv, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_MMI(const uint8_t* src_uv, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width); +void ScaleUVRowDownEven_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); + // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. diff --git a/include/libyuv/scale_uv.h b/include/libyuv/scale_uv.h new file mode 100644 index 000000000..1b6327aae --- /dev/null +++ b/include/libyuv/scale_uv.h @@ -0,0 +1,38 @@ +/* + * Copyright 2020 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef INCLUDE_LIBYUV_SCALE_UV_H_ +#define INCLUDE_LIBYUV_SCALE_UV_H_ + +#include "libyuv/basic_types.h" +#include "libyuv/scale.h" // For FilterMode + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +LIBYUV_API +int UVScale(const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_UV_H_ diff --git a/linux.mk b/linux.mk index 6b07c5060..3244b7d46 100644 --- a/linux.mk +++ b/linux.mk @@ -49,6 +49,7 @@ LOCAL_OBJ_FILES := \ source/row_win.o \ source/scale_any.o \ source/scale_argb.o \ + source/scale_uv.o \ source/scale.o \ source/scale_common.o \ source/scale_gcc.o \ diff --git a/source/scale.cc b/source/scale.cc index d26bfec7e..cf3c03325 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -17,6 +17,7 @@ #include "libyuv/planar_functions.h" // For CopyPlane #include "libyuv/row.h" #include "libyuv/scale_row.h" +#include "libyuv/scale_uv.h" // For UVScale #ifdef __cplusplus namespace libyuv { @@ -1872,63 +1873,34 @@ int I444Scale_16(const uint16_t* src_y, // Scale an NV12 image. // This function in turn calls a scaling function for each plane. -// TODO(https://bugs.chromium.org/p/libyuv/issues/detail?id=838): Remove -// this once libyuv implements NV12Scale and use the libyuv::NV12Scale(). -// This is copy-pasted from -// webrtc/common_video/libyuv/include/webrtc_libyuv.h +LIBYUV_API int NV12Scale(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - int src_width, - int src_height, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int dst_width, - int dst_height, - enum FilterMode filtering) { - const int src_chroma_width = (src_width + 1) / 2; - const int src_chroma_height = (src_height + 1) / 2; - - if (src_width == dst_width && src_height == dst_height) { - // No scaling. - libyuv::CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, src_width, - src_height); - libyuv::CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, - src_chroma_width * 2, src_chroma_height); - return 0; + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering) { + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); + if (!src_y || !src_uv || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || + dst_width <= 0 || dst_height <= 0) { + return -1; } - // Scaling. - // Allocate temporary memory for spitting UV planes and scaling them. - const int dst_chroma_width = (dst_width + 1) / 2; - const int dst_chroma_height = (dst_height + 1) / 2; - - align_buffer_64(tmp_buffer, - src_chroma_width * src_chroma_height * 2 + - dst_chroma_width * dst_chroma_height * 2); - - uint8_t* const src_u = tmp_buffer; - uint8_t* const src_v = src_u + src_chroma_width * src_chroma_height; - uint8_t* const dst_u = src_v + src_chroma_width * src_chroma_height; - uint8_t* const dst_v = dst_u + dst_chroma_width * dst_chroma_height; - - // Split source UV plane into separate U and V plane using the temporary data. 
- libyuv::SplitUVPlane(src_uv, src_stride_uv, src_u, src_chroma_width, src_v, - src_chroma_width, src_chroma_width, src_chroma_height); - - // Scale the planes. - libyuv::I420Scale( - src_y, src_stride_y, src_u, src_chroma_width, src_v, src_chroma_width, - src_width, src_height, dst_y, dst_stride_y, dst_u, dst_chroma_width, - dst_v, dst_chroma_width, dst_width, dst_height, filtering); - - // Merge the UV planes into the destination. - libyuv::MergeUVPlane(dst_u, dst_chroma_width, dst_v, dst_chroma_width, dst_uv, - dst_stride_uv, dst_chroma_width, dst_chroma_height); - free_aligned_buffer_64(tmp_buffer); + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, + dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); return 0; } diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 411a35a8f..451d4ec4d 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -981,7 +981,7 @@ static void ScaleARGB(const uint8_t* src, } } if (dx == 0x10000 && (x & 0xffff) == 0) { - // Arbitrary scale vertically, but unscaled vertically. + // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, y, dy, 4, filtering); return; diff --git a/source/scale_common.cc b/source/scale_common.cc index bc167bdc8..fd4cbd038 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -776,6 +776,8 @@ void ScaleAddRow_16_C(const uint16_t* src_ptr, } } +// ARGB scale row functions + void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -1018,6 +1020,235 @@ void ScaleARGBFilterCols64_C(uint8_t* dst_argb, #undef BLENDERC #undef BLENDER +// UV scale row functions +// same as ARGB but 2 channels + +void ScaleUVRowDown2_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int x; + (void)src_stride; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleUVRowDown2Linear_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + (void)src_stride; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + 1) >> 1; + dst_uv[1] = (src_uv[1] + src_uv[3] + 1) >> 1; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDown2Box_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += 4; + dst_uv += 2; + } +} + +void ScaleUVRowDownEven_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + (void)src_stride; + int x; + for (x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_uv, + int dst_width) { + int x; + for (x = 
0; x < dst_width; ++x) { + dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] + + src_uv[src_stride + 2] + 2) >> + 2; + dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] + + src_uv[src_stride + 3] + 2) >> + 2; + src_uv += src_stepx * 2; + dst_uv += 2; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleUVCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +void ScaleUVCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleUVColsUp2_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + (void)x; + (void)dx; + for (j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleUVFilterCols_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x, + int dx) { + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} + +void ScaleUVFilterCols64_C(uint8_t* dst_uv, + const uint8_t* src_uv, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint16_t* src = (const uint16_t*)(src_uv); + uint16_t* dst = (uint16_t*)(dst_uv); + int j; + for (j = 0; j < dst_width - 1; j += 2) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int64_t xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint16_t a = src[xi]; + uint16_t b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + // Scale plane vertically with bilinear interpolation. 
void ScalePlaneVertical(int src_height, int dst_width, diff --git a/source/scale_uv.cc b/source/scale_uv.cc new file mode 100644 index 000000000..99742a6db --- /dev/null +++ b/source/scale_uv.cc @@ -0,0 +1,832 @@ +/* + * Copyright 2020 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include +#include + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyUV +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Macros to enable specialized scalers +#define HAS_SCALEUVDOWN2 1 +#define HAS_SCALEUVDOWN4BOX 1 +#define HAS_SCALEUVDOWNEVEN 1 +#define HAS_SCALEUVBILINEARDOWN 1 +#define HAS_SCALEUVBILINEARUP 1 +#define HAS_UVCOPY 1 +#define HAS_SCALEPLANEVERTICAL 1 + +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// ScaleUV, 1/2 +// This is an optimized version for scaling down a UV to 1/2 of +// its original size. +#ifdef HAS_SCALEUVDOWN2 +static void ScaleUVDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int row_stride = src_stride * (dy >> 16); + void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, + uint8_t* dst_uv, int dst_width) = + filtering == kFilterNone + ? ScaleUVRowDown2_C + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C + : ScaleUVRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. + assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. + // Advance to odd row, even column. + if (filtering == kFilterBilinear) { + src_uv += (y >> 16) * src_stride + (x >> 16) * 2; + } else { + src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2; + } + +#if defined(HAS_SCALEUVROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_SSE2 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSE2 + : ScaleUVRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_SSE2 + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSE2 + : ScaleUVRowDown2Box_SSE2); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON + : ScaleUVRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_NEON + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON + : ScaleUVRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_MMI + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI + : ScaleUVRowDown2Box_Any_MMI); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_MMI + : (filtering == kFilterLinear ? 
ScaleUVRowDown2Linear_MMI + : ScaleUVRowDown2Box_MMI); + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA + : ScaleUVRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDown2 = + filtering == kFilterNone + ? ScaleUVRowDown2_MSA + : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA + : ScaleUVRowDown2Box_MSA); + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif // HAS_SCALEUVDOWN2 + +// ScaleUV, 1/4 +// This is an optimized version for scaling down a UV to 1/4 of +// its original size. +#ifdef HAS_SCALEUVDOWN4BOX +static void ScaleUVDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + // Allocate 2 rows of UV. + const int kRowSize = (dst_width * 2 * 2 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + int row_stride = src_stride * (dy >> 16); + void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride, + uint8_t* dst_uv, int dst_width) = + ScaleUVRowDown2Box_C; + // Advance to odd row, even column. + src_uv += (y >> 16) * src_stride + (x >> 16) * 2; + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. +#if defined(HAS_SCALEUVROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_SSE2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON; + } + } +#endif + + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2); + ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); + ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } + free_aligned_buffer_64(row); +} +#endif // HAS_SCALEUVDOWN4BOX + +// ScaleUV Even +// This is an optimized version for scaling down a UV to even +// multiple of its original size. +#ifdef HAS_SCALEUVDOWNEVEN +static void ScaleUVDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + int col_step = dx >> 16; + int row_stride = (dy >> 16) * src_stride; + void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, + int src_step, uint8_t* dst_uv, int dst_width) = + filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; + (void)src_width; + (void)src_height; + assert(IS_ALIGNED(src_width, 2)); + assert(IS_ALIGNED(src_height, 2)); + src_uv += (y >> 16) * src_stride + (x >> 16) * 2; +#if defined(HAS_SCALEUVROWDOWNEVEN_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSE2 + : ScaleUVRowDownEven_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? 
ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSE2; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON + : ScaleUVRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI; + } + } +#endif +#if defined(HAS_SCALEUVROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVRowDownEven = + filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA; + } + } +#endif + + if (filtering == kFilterLinear) { + src_stride = 0; + } + for (j = 0; j < dst_height; ++j) { + ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width); + src_uv += row_stride; + dst_uv += dst_stride; + } +} +#endif + +// Scale UV down with bilinear interpolation. +#ifdef HAS_SCALEUVBILINEARDOWN +static void ScaleUVBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; + int clip_src_width; + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. + if (xr > src_width) { + xr = src_width; + } + clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. 
+ src_uv += xl * 2; + x -= (int)(xl << 16); +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(clip_src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. + // Allocate a row of UV. + { + align_buffer_64(row, clip_src_width * 2); + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + for (j = 0; j < dst_height; ++j) { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleUVFilterCols(dst_uv, row, dst_width, x, dx); + } + dst_uv += dst_stride; + y += dy; + if (y > max_y) { + y = max_y; + } + } + free_aligned_buffer_64(row); + } +} +#endif + +// Scale UV up with bilinear interpolation. +#ifdef HAS_SCALEUVBILINEARUP +static void ScaleUVBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy, + enum FilterMode filtering) { + int j; + void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, + int dst_width, int x, int dx) = + filtering ? 
ScaleUVFilterCols_C : ScaleUVCols_C; + const int max_y = (src_height - 1) << 16; +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + InterpolateRow = InterpolateRow_Any_MMI; + if (IS_ALIGNED(dst_width, 2)) { + InterpolateRow = InterpolateRow_MMI; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif + if (src_width >= 32768) { + ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C; + } +#if defined(HAS_SCALEUVFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVFilterCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVFilterCols_MSA; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleUVFilterCols = ScaleUVCols_SSE2; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleUVFilterCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MMI) + if (!filtering && TestCpuFlag(kCpuHasMMI)) { + ScaleUVFilterCols = ScaleUVCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleUVFilterCols = ScaleUVCols_MMI; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleUVFilterCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVCols_MSA; + } + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleUVFilterCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleUVFilterCols = ScaleUVColsUp2_SSE2; + } +#endif +#if defined(HAS_SCALEUVCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleUVFilterCols = ScaleUVColsUp2_MMI; + } +#endif + } + + if (y > max_y) { + y = max_y; + } + + { + int yi = y >> 16; + const uint8_t* src = src_uv + yi * src_stride; + + // Allocate 2 rows of UV. 
+ const int kRowSize = (dst_width * 2 + 15) & ~15; + align_buffer_64(row, kRowSize * 2); + + uint8_t* rowptr = row; + int rowstride = kRowSize; + int lasty = yi; + + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + + for (j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_uv + yi * src_stride; + } + if (yi != lasty) { + ScaleUVFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf); + } + dst_uv += dst_stride; + y += dy; + } + free_aligned_buffer_64(row); + } +} +#endif // HAS_SCALEUVBILINEARUP + +// Scale UV to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. + +static void ScaleUVSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv, + int x, + int dx, + int y, + int dy) { + int j; + void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, + int x, int dx) = + (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C; + (void)src_height; +#if defined(HAS_SCALEUVCOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { + ScaleUVCols = ScaleUVCols_SSE2; + } +#endif +#if defined(HAS_SCALEUVCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleUVCols = ScaleUVCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVCols_NEON; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ScaleUVCols = ScaleUVCols_Any_MMI; + if (IS_ALIGNED(dst_width, 1)) { + ScaleUVCols = ScaleUVCols_MMI; + } + } +#endif +#if defined(HAS_SCALEUVCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleUVCols = ScaleUVCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleUVCols = ScaleUVCols_MSA; + } + } +#endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleUVCols = ScaleUVColsUp2_C; +#if defined(HAS_SCALEUVCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { + ScaleUVCols = ScaleUVColsUp2_SSE2; + } +#endif +#if defined(HAS_SCALEUVCOLSUP2_MMI) + if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { + ScaleUVCols = ScaleUVColsUp2_MMI; + } +#endif + } + + for (j = 0; j < dst_height; ++j) { + ScaleUVCols(dst_uv, src_uv + (y >> 16) * src_stride, dst_width, x, dx); + dst_uv += dst_stride; + y += dy; + } +} + +// Copy UV with optional flipping +#ifdef HAS_UVCOPY +static int UVCopy(const uint8_t* src_UV, + int src_stride_UV, + uint8_t* dst_UV, + int dst_stride_UV, + int width, + int height) { + if (!src_UV || !dst_UV || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_UV = src_UV + (height - 1) * src_stride_UV; + src_stride_UV = -src_stride_UV; + } + + CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height); + return 0; +} +#endif // HAS_UVCOPY + +// Scale a UV plane (from NV12) +// This function in turn calls a scaling function +// suitable for handling the desired resolutions. 
+static void ScaleUV(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, + enum FilterMode filtering) { + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + // UV does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); + + // Negative src_height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); + src_width = Abs(src_width); + if (clip_x) { + int64_t clipf = (int64_t)(clip_x)*dx; + x += (clipf & 0xffff); + src += (clipf >> 16) * 2; + dst += clip_x * 2; + } + if (clip_y) { + int64_t clipf = (int64_t)(clip_y)*dy; + y += (clipf & 0xffff); + src += (clipf >> 16) * src_stride; + dst += clip_y * dst_stride; + } + + // Special case for integer step values. + if (((dx | dy) & 0xffff) == 0) { + if (!dx || !dy) { // 1 pixel wide and/or tall. + filtering = kFilterNone; + } else { + // Optimized even scale down. ie 2, 4, 6, 8, 10x. + if (!(dx & 0x10000) && !(dy & 0x10000)) { +#ifdef HAS_SCALEUVDOWN2 + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleUVDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } +#endif +#ifdef HAS_SCALEUVDOWN4BOX + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. + ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); + return; + } +#endif +#ifdef HAS_SCALEUVDOWNEVEN + ScaleUVDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; +#endif + } + // Optimized odd scale down. ie 3, 5, 7, 9x. + if ((dx & 0x10000) && (dy & 0x10000)) { + filtering = kFilterNone; +#ifdef HAS_UVCOPY + if (dx == 0x10000 && dy == 0x10000) { + // Straight copy. + UVCopy(src + (y >> 16) * src_stride + (x >> 16) * 2, src_stride, dst, + dst_stride, clip_width, clip_height); + return; + } +#endif + } + } + } + // HAS_SCALEPLANEVERTICAL + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled horizontally. + ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, 4, filtering); + return; + } + +#ifdef HAS_SCALEUVBILINEARUP + if (filtering && dy < 65536) { + ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } +#endif +#ifdef HAS_SCALEUVBILINEARDOWN + if (filtering) { + ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); + return; + } +#endif + ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); +} + +// Scale an UV image. 
+LIBYUV_API +int UVScale(const uint8_t* src_uv, + int src_stride_uv, + int src_width, + int src_height, + uint8_t* dst_uv, + int dst_stride_uv, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { + return -1; + } + ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv, + dst_width, dst_height, 0, 0, dst_width, dst_height, filtering); + return 0; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 86ac81647..9801b9d78 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -700,6 +700,12 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ + TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \ + int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \ int diff = I420TestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ @@ -730,10 +736,11 @@ TEST_FACTOR(3, 1, 3, 0) benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \ - int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \ - height, kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ + TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \ + int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } diff --git a/unit_test/scale_uv_test.cc b/unit_test/scale_uv_test.cc new file mode 100644 index 000000000..c8b57d174 --- /dev/null +++ b/unit_test/scale_uv_test.cc @@ -0,0 +1,146 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "../unit_test/unit_test.h" +#include "libyuv/cpu_id.h" +#include "libyuv/scale_uv.h" +#include "libyuv/video_common.h" + +namespace libyuv { + +#define STRINGIZE(line) #line +#define FILELINESTR(file, line) file ":" STRINGIZE(line) + +// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. +static int UVTestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i, j; + const int b = 0; // 128 to test for padding/stride. + int64_t src_uv_plane_size = + (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL; + int src_stride_uv = (b * 2 + Abs(src_width)) * 2; + + align_buffer_page_end(src_uv, src_uv_plane_size); + if (!src_uv) { + printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + MemRandomize(src_uv, src_uv_plane_size); + + int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL; + int dst_stride_uv = (b * 2 + dst_width) * 2; + + align_buffer_page_end(dst_uv_c, dst_uv_plane_size); + align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); + if (!dst_uv_c || !dst_uv_opt) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + memset(dst_uv_c, 2, dst_uv_plane_size); + memset(dst_uv_opt, 3, dst_uv_plane_size); + + // Warm up both versions for consistent benchmarks. + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, + src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, + dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, + src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, + dst_width, dst_height, f); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + double c_time = get_time(); + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, + src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, + dst_width, dst_height, f); + + c_time = (get_time() - c_time); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + double opt_time = get_time(); + for (i = 0; i < benchmark_iterations; ++i) { + UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, + src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, + dst_width, dst_height, f); + } + opt_time = (get_time() - opt_time) / benchmark_iterations; + + // Report performance of C vs OPT + printf("filter %d - %8d us C - %8d us OPT\n", f, + static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); + + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference + // of the buffers and look to see that the max difference isn't + // over 2. + int max_diff = 0; + for (i = b; i < (dst_height + b); ++i) { + for (j = b * 2; j < (dst_width + b) * 2; ++j) { + int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - + dst_uv_opt[(i * dst_stride_uv) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + } + + free_aligned_buffer_page_end(dst_uv_c); + free_aligned_buffer_page_end(dst_uv_opt); + free_aligned_buffer_page_end(src_uv); + return max_diff; +} + +#define TEST_SCALETO1(name, width, height, filter, max_diff) \ + TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ + int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ + int diff = UVTestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } + +/// Test scale to a specified size with all 3 filters. 
+#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(name, width, height, None, 0) \ + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) + +TEST_SCALETO(UVScale, 1, 1) +TEST_SCALETO(UVScale, 320, 240) +TEST_SCALETO(UVScale, 569, 480) +TEST_SCALETO(UVScale, 640, 360) +#ifdef ENABLE_SLOW_TESTS +TEST_SCALETO(UVScale, 1280, 720) +TEST_SCALETO(UVScale, 1920, 1080) +#endif // ENABLE_SLOW_TESTS +#undef TEST_SCALETO1 +#undef TEST_SCALETO + +} // namespace libyuv
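Usage note: a minimal sketch of how the two entry points touched by this change might be called from application code. NV12Scale is assumed to be declared in libyuv/scale.h (this patch only adds LIBYUV_API to its definition in source/scale.cc); UVScale comes from the new include/libyuv/scale_uv.h. Frame dimensions, buffer names and the bilinear filter choice below are illustrative, not taken from the patch.

#include <cstdint>
#include <vector>
#include "libyuv/scale.h"     // NV12Scale, FilterMode
#include "libyuv/scale_uv.h"  // UVScale

void ScaleNV12Frame() {
  const int src_width = 1280, src_height = 720;
  const int dst_width = 640, dst_height = 360;
  // NV12: full-resolution Y plane followed by a half-resolution interleaved UV plane.
  std::vector<uint8_t> src_y(src_width * src_height);
  std::vector<uint8_t> src_uv((src_width / 2) * (src_height / 2) * 2);
  std::vector<uint8_t> dst_y(dst_width * dst_height);
  std::vector<uint8_t> dst_uv((dst_width / 2) * (dst_height / 2) * 2);

  // Scale both planes in one call; with this patch NV12Scale forwards the
  // packed UV plane to UVScale instead of splitting it into temporary U/V planes.
  libyuv::NV12Scale(src_y.data(), src_width, src_uv.data(), src_width,
                    src_width, src_height, dst_y.data(), dst_width,
                    dst_uv.data(), dst_width, dst_width, dst_height,
                    libyuv::kFilterBilinear);

  // Or scale only the packed UV plane. The width/height arguments are the UV
  // plane's own dimensions (half the luma dimensions); strides are in bytes,
  // i.e. 2 bytes per UV sample pair.
  libyuv::UVScale(src_uv.data(), src_width, src_width / 2, src_height / 2,
                  dst_uv.data(), dst_width, dst_width / 2, dst_height / 2,
                  libyuv::kFilterBilinear);
}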
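The row scalers added here all work in 16.16 fixed point, as the comment on ScaleUVSimple states: the upper 16 bits of x and dx hold the integer source position and the lower 16 bits the fraction. A small standalone sketch of that arithmetic for an illustrative 3-to-8 horizontal upscale; the starting offset and constants are for demonstration only and are not taken from ScaleSlope.

#include <cstdio>

int main() {
  const int src_width = 3, dst_width = 8;
  int dx = (src_width << 16) / dst_width;  // 0x6000 == 0.375 in 16.16
  int x = dx / 2;                          // sample from the middle of the first step
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source column, as in ScaleUVCols_C
    int xf = (x >> 9) & 0x7f;  // 7-bit blend fraction, as used by the BLENDER macros
    printf("dst col %d <- src col %d, fraction %d/128\n", j, xi, xf);
    x += dx;
  }
  return 0;
}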
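For integer scale factors the ScaleUV dispatcher picks a specialized path purely from the 16.16 step values. A hedged summary of that routing as a small table-printing program; the mapping mirrors the branches in ScaleUV above, and the 1/4 case only takes ScaleUVDown4Box when kFilterBox is requested, otherwise it falls through to ScaleUVDownEven.

#include <cstdio>

int main() {
  const int factors[] = {1, 2, 3, 4, 8};  // downscale denominators
  for (int f : factors) {
    int dx = f << 16;  // integer step in 16.16 fixed point
    const char* path = (dx == 0x10000)   ? "UVCopy (straight copy)"
                       : (dx == 0x20000) ? "ScaleUVDown2"
                       : (dx == 0x40000) ? "ScaleUVDown4Box (kFilterBox only)"
                       : !(dx & 0x10000) ? "ScaleUVDownEven"
                                         : "ScaleUVSimple (filtering reduced to None)";
    printf("1/%d: dx=0x%05x -> %s\n", f, dx, path);
  }
  return 0;
}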