From 26277baf96fd95bf6efa4abab82775bde9bc5ccb Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 21 Jan 2025 15:56:56 -0800 Subject: [PATCH] J420ToI420 using planar 8 bit scaling - Add Convert8To8Plane which scale and add 8 bit values allowing full range YUV to be converted to limited range YUV libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 Samsung S23 J420ToI420_Opt (45 ms) I420ToI420_Opt (37 ms) Skylake J420ToI420_Opt (596 ms) I420ToI420_Opt (99 ms) Bug: 381327032 Change-Id: I380c3fa783491f2e3727af28b0ea9ce16d2bb8a4 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6182631 Reviewed-by: Wan-Teh Chang --- README.chromium | 2 +- include/libyuv/convert.h | 17 +++ include/libyuv/planar_functions.h | 10 ++ include/libyuv/row.h | 17 +++ include/libyuv/version.h | 2 +- source/convert.cc | 72 +++++++++++++ source/cpu_id.cc | 2 +- source/planar_functions.cc | 75 +++++++++++++ source/row_any.cc | 28 +++++ source/row_common.cc | 18 ++++ source/row_neon64.cc | 170 ++++++++++++++++++------------ source/scale.cc | 54 +++++----- source/scale_neon64.cc | 3 +- unit_test/convert_test.cc | 23 ++++ unit_test/planar_test.cc | 31 ++++++ 15 files changed, 428 insertions(+), 96 deletions(-) diff --git a/README.chromium b/README.chromium index 5442e7f7c..18e76cc59 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1899 +Version: 1900 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 5c7669b5d..79dcf0555 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -598,6 +598,23 @@ int I400ToI420(const uint8_t* src_y, int width, int height); +// Convert J420 to I420. 
+LIBYUV_API +int J420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert I400 (grey) to NV21. LIBYUV_API int I400ToNV21(const uint8_t* src_y, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 678074a14..5b79efffc 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -78,6 +78,16 @@ void Convert8To16Plane(const uint8_t* src_y, int width, int height); +LIBYUV_API +void Convert8To8Plane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 220 for Y, 225 for U,V + int bias, // 16 + int width, + int height); + // Set a plane of data to a 32 bit value. LIBYUV_API void SetPlane(uint8_t* dst_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index fee2d2481..815ac6a5a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -507,6 +507,7 @@ extern "C" { // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_CONVERT8TO8ROW_NEON #define HAS_ARGBTOAR30ROW_NEON #define HAS_ABGRTOAR30ROW_NEON #define HAS_I210ALPHATOARGBROW_NEON @@ -3641,6 +3642,22 @@ void Convert16To8Row_SME(const uint16_t* src_y, int scale, int width); +void Convert8To8Row_C(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width); +void Convert8To8Row_NEON(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width); +void Convert8To8Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int bias, + int width); + void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width); diff --git 
a/include/libyuv/version.h b/include/libyuv/version.h index 842fe201f..adf3e8538 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1899 +#define LIBYUV_VERSION 1900 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 665f0d23d..0c974f5ff 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -4356,6 +4356,78 @@ int P010ToNV12(const uint16_t* src_y, 1, 1, 16); } +static int Planar8bitTo8bit(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int scale_y, + int bias_y, + int scale_uv, + int bias_uv) { + int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + uv_height = -uv_height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (uv_height - 1) * src_stride_u; + src_v = src_v + (uv_height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + if (dst_y) { + Convert8To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale_y, bias_y, + width, height); + } + // Convert UV planes. 
+ Convert8To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale_uv, bias_uv, + uv_width, uv_height); + Convert8To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale_uv, bias_uv, + uv_width, uv_height); + return 0; +} + +LIBYUV_API +int J420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar8bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 220, 16, 225, 16); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/cpu_id.cc b/source/cpu_id.cc index f5cc968fb..e4acbecf4 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -450,7 +450,7 @@ static SAFEBUFFERS int GetCpuFlags(void) { ((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) | ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0); if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) { - cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2: 0; + cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0; } } } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 4c87b7d3d..f0763c41f 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -234,6 +234,81 @@ void Convert8To16Plane(const uint8_t* src_y, } } +// Convert a plane of 8 bit data to 8 bit +LIBYUV_API +void Convert8To8Plane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 220 for Y, 225 to UV + int bias, // 16 + int width, + int height) { + int y; + void (*Convert8To8Row)(const uint8_t* src_y, uint8_t* dst_y, int scale, + int bias, int width) = Convert8To8Row_C; + + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Convert8To8Row = Convert8To8Row_Any_NEON; + if (IS_ALIGNED(width, 32)) { + Convert8To8Row = Convert8To8Row_NEON; + } + } +#endif +#if defined(HAS_CONVERT8TO8ROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + Convert8To8Row = Convert8To8Row_SME; + } +#endif +#if defined(HAS_CONVERT8TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert8To8Row = Convert8To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + Convert8To8Row = Convert8To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT8TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To8Row = Convert8To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To8Row = Convert8To8Row_AVX2; + } + } +#endif +#if defined(HAS_CONVERT8TO8ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + Convert8To8Row = Convert8To8Row_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + Convert8To8Row = Convert8To8Row_AVX512BW; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To8Row(src_y, dst_y, scale, bias, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + // Copy I422. LIBYUV_API int I422Copy(const uint8_t* src_y, diff --git a/source/row_any.cc b/source/row_any.cc index 70ab046ec..8344aa35f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1780,6 +1780,34 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) #endif #undef ANY11C +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. 
+#define ANY11SB(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int bias, \ + int width) { \ + SIMD_ALIGNED(STYPE vin[64]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, bias, n); \ + } \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, scale, bias, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ + } + +#ifdef HAS_CONVERT8TO8ROW_NEON +ANY11SB(Convert8To8Row_Any_NEON, + Convert8To8Row_NEON, + 1, + 1, + uint8_t, + uint8_t, + 31) +#endif +#undef ANY11SB + // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. #define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ diff --git a/source/row_common.cc b/source/row_common.cc index 4b5948201..cd16c1721 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -3240,6 +3240,24 @@ void Convert8To16Row_C(const uint8_t* src_y, } } +// Use scale to convert J420 to I420 +// scale parameter is 8.8 fixed point but limited to 0 to 255 +// Function is based on DivideRow, but adds a bias +// Does not clamp +void Convert8To8Row_C(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + int x; + assert(scale >= 0); + assert(scale <= 255); + + for (x = 0; x < width; ++x) { + dst_y[x] = ((src_y[x] * scale) >> 8) + bias; + } +} + void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { memcpy(dst, src, count); } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index a8ba41357..dc4ca2417 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -290,23 +290,22 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile( - YUVTORGB_SETUP + asm 
volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV210 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [alpha] "r"(alpha) // %[alpha] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [alpha] "r"(alpha) // %[alpha] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I410ToAR30Row_NEON(const uint16_t* src_y, @@ -319,23 +318,22 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile( - YUVTORGB_SETUP + asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV410 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [alpha] "r"(alpha) // %[alpha] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] 
"r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [alpha] "r"(alpha) // %[alpha] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I212ToAR30Row_NEON(const uint16_t* src_y, @@ -347,22 +345,21 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile( - YUVTORGB_SETUP + asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "1: \n" READYUV212 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit) // %[limit] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit) // %[limit] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I210ToARGBRow_NEON(const uint16_t* src_y, @@ -374,7 +371,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV210 NVTORGB RGBTORGB8 + "1: \n" READYUV210 NVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -397,7 +395,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV410 NVTORGB RGBTORGB8 + "1: \n" READYUV410 NVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" "st4 
{v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -422,7 +421,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV212 NVTORGB RGBTORGB8 + "1: \n" READYUV212 NVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -526,22 +526,23 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile(YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] \n" "1: \n" READYUVP210 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void P410ToAR30Row_NEON(const uint16_t* src_y, @@ -552,22 +553,23 @@ void P410ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; - asm volatile(YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] 
\n" "1: \n" READYUVP410 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I422ToAR30Row_NEON(const uint8_t* src_y, @@ -820,7 +822,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, READYUV422 I4XXTORGB RGBTORGB8_TOP "subs %w[width], %w[width], #8 \n" // ARGBTOARGB1555_FROM_TOP - "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555. + "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels + // RGB1555. "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] @@ -3460,7 +3463,8 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "movi v6.16b, #66 \n" // R * 0.2578 coefficient "movi v7.16b, #16 \n" // Add 16 constant "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 + // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. RGB555TOARGB "umull v16.8h, v0.8b, v4.8b \n" // B @@ -3492,7 +3496,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "movi v26.16b, #66 \n" // R * 0.2578 coefficient "movi v27.16b, #16 \n" // Add 16 constant "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. 
+ "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 + // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. ARGB4444TORGB "umull v16.8h, v0.8b, v24.8b \n" // B @@ -4136,7 +4141,8 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint32_t value) { asm volatile( "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. + "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b + // aarrggbbaarrggbb. "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. @@ -5277,11 +5283,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 src pixels per loop "mul v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "mul v1.8h, v1.8h, v2.8h \n" "stp q0, q1, [%1], #32 \n" // store 16 pixels - "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 @@ -5298,6 +5304,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, "dup v4.8h, %w3 \n" "1: \n" "ldp q2, q3, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 src pixels per loop "umull v0.4s, v2.4h, v4.4h \n" "umull2 v1.4s, v2.8h, v4.8h \n" "umull v2.4s, v3.4h, v4.4h \n" @@ -5306,7 +5313,6 @@ void DivideRow_16_NEON(const uint16_t* src_y, "uzp2 v0.8h, v0.8h, v1.8h \n" "uzp2 v1.8h, v2.8h, v3.8h \n" "stp q0, q1, [%1], #32 \n" // store 16 pixels - "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 @@ -5332,11 +5338,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y, "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 src pixels per loop "uqshl v0.8h, v0.8h, v2.8h \n" "uqshl v1.8h, v1.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "uzp2 v0.16b, v0.16b, v1.16b \n" - "subs %w2, %w2, #16 \n" // 16 src pixels per loop "str q0, [%1], #16 \n" // store 16 pixels "b.gt 1b \n" : "+r"(src_y), // %0 @@ -5346,6 +5352,40 @@ void Convert16To8Row_NEON(const uint16_t* src_y, : "cc", "memory", "v0", 
"v1", "v2"); } +// Use scale to convert J420 to I420 +// scale parameter is 8.8 fixed point but limited to 0 to 255 +// Function is based on DivideRow, but adds a bias +// Does not clamp +void Convert8To8Row_NEON(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + asm volatile( + "dup v4.16b, %w3 \n" // scale + "dup v5.16b, %w4 \n" // bias + "1: \n" + "ldp q2, q3, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 pixels per loop + "umull v0.8h, v2.8b, v4.8b \n" + "umull2 v1.8h, v2.16b, v4.16b \n" + "umull v2.8h, v3.8b, v4.8b \n" + "umull2 v3.8h, v3.16b, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "uzp2 v0.16b, v0.16b, v1.16b \n" + "uzp2 v1.16b, v2.16b, v3.16b \n" + "add v0.16b, v0.16b, v5.16b \n" // add bias (16) + "add v1.16b, v1.16b, v5.16b \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale), // %3 + "r"(bias) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/scale.cc b/source/scale.cc index 868a84a28..76379fd6e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -2336,9 +2336,9 @@ int I420Scale(const uint8_t* src_y, int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2381,9 +2381,9 @@ int I420Scale_16(const uint16_t* src_y, int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || 
dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2426,9 +2426,9 @@ int I420Scale_12(const uint16_t* src_y, int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2470,9 +2470,9 @@ int I444Scale(const uint8_t* src_y, enum FilterMode filtering) { int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2511,9 +2511,9 @@ int I444Scale_16(const uint16_t* src_y, enum FilterMode filtering) { int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2552,9 +2552,9 @@ int I444Scale_12(const uint16_t* src_y, enum FilterMode filtering) { int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if 
(!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2598,9 +2598,9 @@ int I422Scale(const uint8_t* src_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2641,9 +2641,9 @@ int I422Scale_16(const uint16_t* src_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2684,9 +2684,9 @@ int I422Scale_12(const uint16_t* src_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index de19989fc..848d55416 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1369,7 +1369,8 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, "uzp2 v1.8h, v2.8h, v3.8h \n" "uzp2 v2.8h, v4.8h, v5.8h \n" "uzp2 
v3.8h, v6.8h, v7.8h \n" - "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. + "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per + // iteration. "stp q0, q1, [%[dst_ptr]] \n" "stp q2, q3, [%[dst_ptr], #32] \n" "add %[dst_ptr], %[dst_ptr], #64 \n" diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index ef30b12b5..e9e58d329 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -188,6 +188,7 @@ TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10) TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8) +TESTPLANARTOP(J420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12) @@ -2107,6 +2108,28 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) { } #endif +TEST_F(LibYUVConvertTest, TestJ420ToI420) { + const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255, + 0, 0, 128, 128, 255, 255}; + const uint8_t src_u[3] = {0, 128, 255}; + const uint8_t src_v[3] = {0, 128, 255}; + uint8_t dst_y[12]; + uint8_t dst_u[3]; + uint8_t dst_v[3]; + ASSERT_EQ(J420ToI420(src_y, 6, src_u, 3, src_v, 3, dst_y, 6, dst_u, 3, dst_v, + 3, 6, 2), + 0); + EXPECT_EQ(dst_y[0], 16); + EXPECT_EQ(dst_y[2], 126); + EXPECT_EQ(dst_y[4], 235); + EXPECT_EQ(dst_u[0], 16); + EXPECT_EQ(dst_u[1], 128); + EXPECT_EQ(dst_u[2], 240); + EXPECT_EQ(dst_v[0], 16); + EXPECT_EQ(dst_v[1], 128); + EXPECT_EQ(dst_v[2], 240); +} + #endif // !defined(LEAN_TESTS) } // namespace libyuv diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index ca3cbe769..576696bca 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -3802,6 +3802,37 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) { 
free_aligned_buffer_page_end(dst_pixels_y_c); } +TEST_F(LibYUVPlanarTest, Convert8To8Plane) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_y, kPixels); + align_buffer_page_end(dst_pixels_y_opt, kPixels); + align_buffer_page_end(dst_pixels_y_c, kPixels); + + MemRandomize(src_pixels_y, kPixels); + memset(dst_pixels_y_opt, 0, kPixels); + memset(dst_pixels_y_c, 1, kPixels); + + MaskCpuFlags(disable_cpu_flags_); + Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_c, + benchmark_width_, 220, 16, benchmark_width_, + benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_opt, + benchmark_width_, 220, 16, benchmark_width_, + benchmark_height_); + } + + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); + } + + free_aligned_buffer_page_end(src_pixels_y); + free_aligned_buffer_page_end(dst_pixels_y_opt); + free_aligned_buffer_page_end(dst_pixels_y_c); +} + TEST_F(LibYUVPlanarTest, YUY2ToY) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels * 2);