From f2ac6db694d1e5b0af1d7b05dc431e0e455fe228 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Wed, 29 Apr 2026 12:54:49 -0700
Subject: [PATCH] RAWToNV21 using SME, SVE, I8MM or Neon

Pixel 9
Now SVE2 2 pass
LibYUVConvertTest.RAWToNV21_Opt (364 ms)
  31.76% libyuv::ARGBToUVMatrixRow_SVE_SC()
  30.38% RAWToARGBRow_SVE2
  26.81% ARGBToYMatrixRow_NEON_DotProd
   3.26% MergeUVRow_NEON

Was NEON 1 pass
LibYUVConvertTest.RAWToJNV21_Opt (295 ms)
  44.14% RAWToYJRow_NEON
  41.91% RAWToUVJRow_NEON
   5.11% MergeUVRow_NEON

Clang on Intel Skylake
clang
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (301 ms)
visual c (row_win)
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (2056 ms)
clang
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (275 ms)
visual c
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (365 ms)

Bug: libyuv:42280902
Change-Id: Iaba558ebe96ce6b9881ee9335ba72b8aac390cde
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7802432
Commit-Queue: Frank Barchard
Reviewed-by: richard winterton
Reviewed-by: Dale Curtis
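For orientation, the "2 pass" flow profiled above first converts each pair of
RAW rows to ARGB, then runs the Y and UV matrix kernels over the intermediate
rows, merging U and V last. A scalar sketch of one row pair (illustrative
only: the matrix-constant arguments, prefetching and tail handling are
omitted, and the helper shapes merely approximate the row functions named
above):

  // Sketch: two-pass RAWToNV21 inner loop for one pair of rows.
  void RawToNV21TwoRows(const uint8_t* raw, int raw_stride,
                        uint8_t* dst_y, int y_stride, uint8_t* dst_vu,
                        uint8_t* argb,           // scratch: 2 rows * width * 4
                        uint8_t* u, uint8_t* v,  // scratch: (width + 1) / 2
                        int width) {
    RAWToARGBRow(raw, argb, width);                           // pass 1, row 0
    RAWToARGBRow(raw + raw_stride, argb + width * 4, width);  // pass 1, row 1
    ARGBToYMatrixRow(argb, dst_y, width);                     // pass 2: Y
    ARGBToYMatrixRow(argb + width * 4, dst_y + y_stride, width);
    ARGBToUVMatrixRow(argb, width * 4, u, v, width);  // 2x2-averaged U and V
    MergeUVRow(v, u, dst_vu, (width + 1) / 2);        // NV21 interleaves V,U
  }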
---
 README.chromium             |  2 +-
 include/libyuv/row.h        | 49 +++++++++++++++++++-
 include/libyuv/version.h    |  2 +-
 source/convert.cc           | 26 +++++++++++
 source/convert_from_argb.cc | 85 ++++++++++++++++++++++++++++++++---
 source/row_any.cc           |  6 +++
 source/row_neon.cc          | 66 +++++++++++++++++++++++++++
 source/row_neon64.cc        | 82 +++++++++++++++++++++++++++-------
 source/row_sme.cc           | 14 ++++++
 source/row_sve.cc           | 13 ++++++
 source/row_win.cc           | 89 +++++++++++++++++++++++++++++++++++++
 11 files changed, 407 insertions(+), 27 deletions(-)

diff --git a/README.chromium b/README.chromium
index 27d03c752..1407f963e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1933
+Version: 1934
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 3b2c52aaa..40272cf5a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -140,6 +140,13 @@ extern "C" {
 
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
+#if !defined(LIBYUV_DISABLE_X86) &&              \
+    (defined(__x86_64__) || defined(__i386__) || \
+     defined(_M_X64) || defined(_M_IX86))
+#define HAS_ARGBTOUVMATRIXROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#endif
+
 #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
      defined(GCC_HAS_AVX2))
@@ -163,7 +170,6 @@ extern "C" {
 #define HAS_I444TORGB24ROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
 #define HAS_J422TOARGBROW_AVX2
-#define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_NV12TOARGBROW_AVX2
 #define HAS_NV12TORGB24ROW_AVX2
@@ -427,6 +433,7 @@ extern "C" {
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVJ444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOUVMATRIXROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #if !defined(__aarch64__)
@@ -573,6 +580,7 @@ extern "C" {
 #define HAS_ARGBTOUV444ROW_NEON_I8MM
 #define HAS_ARGBTOUVJ444ROW_NEON_I8MM
 #define HAS_ARGBTOUVJROW_NEON_I8MM
+#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
 #define HAS_ARGBTOUVROW_NEON_I8MM
 #define HAS_BGRATOUVROW_NEON_I8MM
 #define HAS_RGBATOUVROW_NEON_I8MM
@@ -588,6 +596,7 @@ extern "C" {
 #define HAS_ARGBTORGB565DITHERROW_SVE2
 #define HAS_ARGBTORGB565ROW_SVE2
 #define HAS_ARGBTOUVJROW_SVE2
+#define HAS_ARGBTOUVMATRIXROW_SVE2
 #define HAS_ARGBTOUVROW_SVE2
 #define HAS_AYUVTOUVROW_SVE2
 #define HAS_AYUVTOVUROW_SVE2
@@ -639,6 +648,7 @@ extern "C" {
 #define HAS_ABGRTOUVROW_SME
 #define HAS_ARGBMULTIPLYROW_SME
 #define HAS_ARGBTOUVJROW_SME
+#define HAS_ARGBTOUVMATRIXROW_SME
 #define HAS_ARGBTOUVROW_SME
 #define HAS_BGRATOUVROW_SME
 #define HAS_CONVERT16TO8ROW_SME
@@ -1834,6 +1844,43 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
                                uint8_t* dst_u,
                                uint8_t* dst_v,
                                int width);
+void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb,
+                                int src_stride_argb,
+                                uint8_t* dst_u,
+                                uint8_t* dst_v,
+                                int width,
+                                const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                 int src_stride_argb,
+                                 uint8_t* dst_u,
+                                 uint8_t* dst_v,
+                                 int width,
+                                 const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb,
+                                     int src_stride_argb,
+                                     uint8_t* dst_u,
+                                     uint8_t* dst_v,
+                                     int width,
+                                     const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c);
+void ARGBToUVMatrixRow_SME(const uint8_t* src_argb,
+                           int src_stride_argb,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width,
+                           const struct ArgbConstants* c);
+
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 0b0290a97..b745710eb 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1933
+#define LIBYUV_VERSION 1934
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
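All of the new ARGBToUVMatrixRow prototypes take a const struct
ArgbConstants*. The struct definition is not part of this diff; the AArch64
kernel below loads kRGBToU from byte offset 16 and kRGBToV from offset 32,
and the AVX2 kernel reads each field as a full 16-byte vector, so its layout
is presumably along these lines (an inferred sketch, not the authoritative
definition):

  // Inferred from the kernel loads. The NEON comments below label the
  // coefficients (-BU), (-GU), ... so they are stored negated, and they are
  // replicated so one 16-byte load covers 4 pixels.
  struct ArgbConstants {
    int8_t kRGBToY[16];  // assumed at offset 0
    int8_t kRGBToU[16];  // offset 16: {BU, GU, RU, AU} x 4
    int8_t kRGBToV[16];  // offset 32: {BV, GV, RV, AV} x 4
  };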
diff --git a/source/convert.cc b/source/convert.cc
index a1a7ba9bf..d9fb54778 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -2245,6 +2245,32 @@
       ARGBToUVMatrixRow_C;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 7e7a4f8cf..2c66611e6 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -574,6 +574,32 @@
       ARGBToUVMatrixRow_C;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@@ -915,6 +941,32 @@
       ARGBToUVMatrixRow_C;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
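These selection ladders deliberately let a later block overwrite an earlier
pick, so the strongest kernel the CPU reports wins; the SSSE3/AVX2 blocks
that follow are compiled out on Arm builds. Note that the SVE2 and SME
kernels are installed without an _Any_ wrapper or width check, consistent
with predicated loops handling the tail in-kernel. The net effect on a core
with all four Arm flags set:

  // Illustrative trace of the ladder on a NEON+I8MM+SVE2+SME core:
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;       // overwritten below
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;  // overwritten below
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;           // overwritten below
  ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;            // final choice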
@@ -4065,13 +4117,6 @@ int ARGBToAB64(const uint8_t* src_argb,
   return 0;
 }
 
-// Enabled if 1 pass is available
-#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
-#define HAS_RAWTOYJROW
-#endif
-
-// RAW to JNV21 full range NV21
-LIBYUV_API
 // Convert RAW to NV21 with Matrix.
 LIBYUV_API
 int RAWToNV21Matrix(const uint8_t* src_raw,
@@ -4226,6 +4271,32 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
     RAWToARGBRow = RAWToARGBRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
+  if (TestCpuFlag(kCpuHasSVE2)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
+  }
+#endif
+#if defined(HAS_ARGBTOUVMATRIXROW_SME)
+  if (TestCpuFlag(kCpuHasSME)) {
+    ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
+  }
+#endif
 #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 9fa7227b7..82a4abe8d 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2264,6 +2264,12 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
   memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \
   }
 
+#ifdef HAS_ARGBTOUVMATRIXROW_NEON
+ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
+ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
+#endif
 #ifdef HAS_ARGBTOUVMATRIXROW_AVX2
 ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
 #endif
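The ANY12MS macro wraps a fixed-width two-plane kernel for arbitrary widths:
the multiple-of-16 prefix goes straight to the SIMD kernel, and the remainder
is copied into a zero-padded temporary, processed at full vector width, then
copied back out. Roughly (a simplified shape; names and buffer sizes only
approximate the real macro in row_any.cc):

  void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src, int stride,
                                  uint8_t* dst_u, uint8_t* dst_v, int width,
                                  const struct ArgbConstants* c) {
    SIMD_ALIGNED(uint8_t vin[128 * 2]);   // both input rows
    SIMD_ALIGNED(uint8_t vout[128 * 3]);  // U at +0, V at +256
    int r = width & 15;   // remainder pixels
    int n = width & ~15;  // multiple-of-16 prefix
    if (n > 0) {
      ARGBToUVMatrixRow_NEON(src, stride, dst_u, dst_v, n, c);
    }
    memset(vin, 0, sizeof(vin));  // zero-pad so the kernel reads defined data
    memcpy(vin, src + n * 4, r * 4);
    memcpy(vin + 128, src + stride + n * 4, r * 4);
    ARGBToUVMatrixRow_NEON(vin, 128, vout, vout + 256, 16, c);
    memcpy(dst_u + (n >> 1), vout, (r + 1) >> 1);
    memcpy(dst_v + (n >> 1), vout + 256, (r + 1) >> 1);
  }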
diff --git a/source/row_neon.cc b/source/row_neon.cc
index d1073352f..895e6f113 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1918,6 +1918,72 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
 // clang-format on
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
+  asm volatile (
+      "add        %1, %0, %1                 \n"  // src_stride + src_argb
+      "vld1.8     {d18}, [%5]                \n"  // load kRGBToU
+      "vld1.8     {d19}, [%6]                \n"  // load kRGBToV
+      "vmovl.s8   q8, d18                    \n"  // U coeffs in q8 (d16, d17)
+      "vmovl.s8   q9, d19                    \n"  // V coeffs in q9 (d18, d19)
+      "vdup.16    q10, d16[0]                \n"  // U0
+      "vdup.16    q11, d16[1]                \n"  // U1
+      "vdup.16    q12, d16[2]                \n"  // U2
+      "vdup.16    q13, d18[0]                \n"  // V0
+      "vdup.16    q14, d18[1]                \n"  // V1
+      "vdup.16    q15, d18[2]                \n"  // V2
+      "1:                                    \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!    \n"  // load 8 ARGB pixels.
+      "vld4.8     {d1, d3, d5, d7}, [%0]!    \n"  // load next 8 ARGB pixels.
+      "subs       %4, %4, #16                \n"  // 16 processed per loop.
+      "vpaddl.u8  q0, q0                     \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8  q1, q1                     \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8  q2, q2                     \n"  // R 16 bytes -> 8 shorts.
+      "vld4.8     {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ARGB pixels.
+      "vld4.8     {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ARGB pixels.
+      "vpadal.u8  q0, q4                     \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8  q1, q5                     \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8  q2, q6                     \n"  // R 16 bytes -> 8 shorts.
+
+      "vrshr.u16  q0, q0, #2                 \n"  // average of 4
+      "vrshr.u16  q1, q1, #2                 \n"
+      "vrshr.u16  q2, q2, #2                 \n"
+
+      "vmov.u16   q3, #0x8000                \n"  // 128.0
+
+      "vmul.s16   q8, q0, q10                \n"  // U = B * U0
+      "vmla.s16   q8, q1, q11                \n"  // U += G * U1
+      "vmla.s16   q8, q2, q12                \n"  // U += R * U2
+
+      "vmul.s16   q9, q0, q13                \n"  // V = B * V0
+      "vmla.s16   q9, q1, q14                \n"  // V += G * V1
+      "vmla.s16   q9, q2, q15                \n"  // V += R * V2
+
+      "vsub.u16   q8, q3, q8                 \n"  // 128.0 - U
+      "vsub.u16   q9, q3, q9                 \n"  // 128.0 - V
+
+      "vqshrn.u16 d0, q8, #8                 \n"  // saturating narrowing shift right
+      "vqshrn.u16 d1, q9, #8                 \n"
+
+      "vst1.8     {d0}, [%2]!                \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                \n"  // store 8 pixels V.
+      "bgt        1b                         \n"
+      : "+r"(src_argb),         // %0
+        "+r"(src_stride_argb),  // %1
+        "+r"(dst_u),            // %2
+        "+r"(dst_v),            // %3
+        "+r"(width)             // %4
+      : "r"(&c->kRGBToU),  // %5
+        "r"(&c->kRGBToV)   // %6
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+}
+
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
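For intuition, both NEON kernels implement this scalar model. The
coefficients arrive negated, so the kernel can form 0x8000 - sum (128.0 in
8.8 fixed point) and keep only the high byte:

  // Scalar model of the U channel, assuming the classic BT.601 studio-range
  // coefficients (U: +112 B, -74 G, -38 R); b, g, r are the rounded 2x2
  // averages the kernel computes with vpaddl/vpadal/vrshr.
  static uint8_t ToU(int b, int g, int r) {
    int sum = b * -112 + g * 74 + r * 38;   // negated coefficients, as stored
    return (uint8_t)((0x8000 - sum) >> 8);  // 128 + (112*b - 74*g - 38*r)/256
  }

A gray block (b = g = r = 128) gives sum = 0, so U = 0x8000 >> 8 = 128, the
neutral chroma value; a pure red block (b = g = 0, r = 255) gives
(32768 - 9690) >> 8 = 90. The result always fits: |sum| <= 112 * 255 = 28560,
so 0x8000 - sum stays within 16 bits and the narrowing shift cannot overflow.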
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 09bad8df9..19016cc3b 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -9,6 +9,7 @@
  */
 
 #include "libyuv/row.h"
+#include "libyuv/convert_from_argb.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -2893,14 +2894,26 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
 // TODO(fbarchard): consider ptrdiff_t for all strides.
 
-void ARGBToUVRow_NEON(const uint8_t* src_argb,
-                      int src_stride_argb,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
+void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
-      RGBTOUV_SETUP_REG
+      "ldr         q16, [%[c], #16]             \n"  // kRGBToU
+      "ldr         q17, [%[c], #32]             \n"  // kRGBToV
+      "sxtl        v16.8h, v16.8b               \n"  // sign extend U coeffs to 16-bit
+      "sxtl        v17.8h, v17.8b               \n"  // sign extend V coeffs to 16-bit
+      "dup         v20.8h, v16.h[0]             \n"  // U0 (-BU)
+      "dup         v21.8h, v16.h[1]             \n"  // U1 (-GU)
+      "dup         v22.8h, v16.h[2]             \n"  // U2 (-RU)
+      "dup         v23.8h, v17.h[0]             \n"  // V0 (-BV)
+      "dup         v24.8h, v17.h[1]             \n"  // V1 (-GV)
+      "dup         v26.8h, v17.h[2]             \n"  // V2 (-RV)
+      "movi        v25.8h, #0x80, lsl #8        \n"  // 128.0 in 16-bit (0x8000)
+      "1:                                       \n"
       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
       "subs        %w4, %w4, #16                \n"  // 16 processed per loop.
@@ -2909,7 +2922,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
       "uaddlp      v1.8h, v1.16b                \n"  // G 16 bytes -> 8 shorts.
       "uaddlp      v2.8h, v2.16b                \n"  // R 16 bytes -> 8 shorts.
-      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
+      "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
       "uadalp      v0.8h, v4.16b                \n"  // B 16 bytes -> 8 shorts.
       "prfm        pldl1keep, [%1, 448]         \n"
       "uadalp      v1.8h, v5.16b                \n"  // G 16 bytes -> 8 shorts.
@@ -2919,7 +2932,20 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
       "urshr       v1.8h, v1.8h, #2             \n"
       "urshr       v2.8h, v2.8h, #2             \n"
 
-      RGBTOUV(v0.8h, v1.8h, v2.8h)
+      // U = B*U0 + G*U1 + R*U2
+      "mul         v3.8h, v0.8h, v20.8h         \n"
+      "mla         v3.8h, v1.8h, v21.8h         \n"
+      "mla         v3.8h, v2.8h, v22.8h         \n"
+
+      // V = B*V0 + G*V1 + R*V2
+      "mul         v4.8h, v0.8h, v23.8h         \n"
+      "mla         v4.8h, v1.8h, v24.8h         \n"
+      "mla         v4.8h, v2.8h, v26.8h         \n"
+
+      // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
+      "subhn       v0.8b, v25.8h, v3.8h         \n"
+      "subhn       v1.8b, v25.8h, v4.8h         \n"
+
       "st1         {v0.8b}, [%2], #8            \n"  // store 8 pixels U.
       "st1         {v1.8b}, [%3], #8            \n"  // store 8 pixels V.
       "b.gt        1b                           \n"
@@ -2928,12 +2954,21 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
         "+r"(dst_u),  // %2
         "+r"(dst_v),  // %3
         "+r"(width)   // %4
-      :
+      : [c] "r"(c)  // %5
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v20", "v21", "v22", "v23", "v24", "v25"
+        "v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
   );
 }
 
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
+                         &kArgbI601Constants);
+}
+
 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
@@ -3449,7 +3484,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 
 // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
-static void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src,
+static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
                                         int src_stride,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
@@ -3546,12 +3581,25 @@ static const int8_t kRGBAToUVCoefficients[] = {
     0, -112, 74, 38, 0, 18, 94, -112,
 };
 
+void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
+                                 int src_stride_argb,
+                                 uint8_t* dst_u,
+                                 uint8_t* dst_v,
+                                 int width,
+                                 const struct ArgbConstants* c) {
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2],
+      (int8_t)c->kRGBToU[3], (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+      (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
+                                   width, uvconstants);
+}
+
 void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
                               kARGBToUVCoefficients);
 }
 
@@ -3560,7 +3608,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                               kABGRToUVCoefficients);
 }
 
@@ -3569,7 +3617,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
                               kBGRAToUVCoefficients);
 }
 
@@ -3578,7 +3626,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
                               kRGBAToUVCoefficients);
 }
 
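One AArch64-specific nicety above: subhn performs the subtraction and returns
the high half of each 16-bit lane in a single instruction, so the 128 bias,
the >> 8 and the narrowing to bytes collapse into one op per vector, where
the AArch32 version needs a vsub/vqshrn pair. Per lane it is equivalent to:

  // What each lane of "subhn v0.8b, v25.8h, v3.8h" computes:
  static uint8_t SubHighNarrow(uint16_t bias /* 0x8000 */, uint16_t sum) {
    return (uint8_t)((uint16_t)(bias - sum) >> 8);  // high byte of difference
  }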
@@ -3606,7 +3654,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
                               kARGBToUVJCoefficients);
 }
 
@@ -3615,7 +3663,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width) {
-  ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                               kABGRToUVJCoefficients);
 }
 
diff --git a/source/row_sme.cc b/source/row_sme.cc
index bd61b20bf..fca536dc4 100644
--- a/source/row_sme.cc
+++ b/source/row_sme.cc
@@ -1120,6 +1120,20 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
       : "cc", "memory", "z0", "z1", "z2", "p0", "p1");
 }
 
+__arm_locally_streaming void ARGBToUVMatrixRow_SME(
+    const uint8_t* src_argb,
+    int src_stride_argb,
+    uint8_t* dst_u,
+    uint8_t* dst_v,
+    int width,
+    const struct ArgbConstants* c) {
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2],
+      (int8_t)c->kRGBToU[3], (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+      (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
+                           uvconstants);
+}
+
 __arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
                                              int src_stride_argb,
                                              uint8_t* dst_u,
diff --git a/source/row_sve.cc b/source/row_sve.cc
index 4a51b68fc..7d8734921 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -217,6 +217,19 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
   NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
 }
 
+void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
+  int8_t uvconstants[8] = {
+      (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2],
+      (int8_t)c->kRGBToU[3], (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1],
+      (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
+  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
+                           uvconstants);
+}
+
 void ARGBToUVRow_SVE2(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
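The SVE2 and SME entry points share one streaming-compatible body,
ARGBToUVMatrixRow_SVE_SC (the function at the top of the Pixel 9 profile);
each wrapper only repacks struct ArgbConstants into the 8-byte
{U0..U3, V0..V3} layout that body expects. A hypothetical shared helper, not
part of this patch, would remove the repeated packing:

  // Hypothetical refactor: the NEON_I8MM, SVE2 and SME wrappers all repeat
  // this repacking of the first four U and V coefficients.
  static inline void PackUVConstants(const struct ArgbConstants* c,
                                     int8_t uvconstants[8]) {
    for (int i = 0; i < 4; ++i) {
      uvconstants[i] = (int8_t)c->kRGBToU[i];
      uvconstants[4 + i] = (int8_t)c->kRGBToV[i];
    }
  }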
diff --git a/source/row_win.cc b/source/row_win.cc
index 25f3ac9fe..77070d031 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -314,6 +314,95 @@ void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int wi
 }
 #endif
 
+#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
+LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
+void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
+                            int src_stride_argb,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width,
+                            const struct ArgbConstants* c) {
+  // 16-byte coefficient rows, broadcast to both 128-bit lanes.
+  __m256i ymm_u = _mm256_broadcastsi128_si256(
+      _mm_loadu_si128((const __m128i*)c->kRGBToU));
+  __m256i ymm_v = _mm256_broadcastsi128_si256(
+      _mm_loadu_si128((const __m128i*)c->kRGBToV));
+  __m256i ymm_0101 = _mm256_set1_epi16(0x0101);  // byte weights of 1
+  __m256i ymm_shuf = _mm256_setr_epi8(  // pair each channel of adjacent pixels
+      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
+  __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
+  __m256i ymm_zero = _mm256_setzero_si256();
+
+  while (width > 0) {
+    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
+    __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
+    __m256i ymm2 =
+        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
+    __m256i ymm3 =
+        _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
+
+    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
+    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
+    ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
+    ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf);
+
+    ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101);  // sum adjacent pixel pairs
+    ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101);
+    ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101);
+    ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101);
+
+    ymm0 = _mm256_add_epi16(ymm0, ymm2);  // add the second row
+    ymm1 = _mm256_add_epi16(ymm1, ymm3);
+
+    ymm0 = _mm256_srli_epi16(ymm0, 1);  // halve, then round via avg:
+    ymm1 = _mm256_srli_epi16(ymm1, 1);
+    ymm0 = _mm256_avg_epu16(ymm0, ymm_zero);  // net effect: (sum + 2) >> 2
+    ymm1 = _mm256_avg_epu16(ymm1, ymm_zero);
+
+    ymm0 = _mm256_packus_epi16(ymm0, ymm1);       // averages back to bytes
+    ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);  // fix lane interleave
+
+    ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v);  // B*V0+G*V1, R*V2+A*V3
+    ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u);  // B*U0+G*U1, R*U2+A*U3
+
+    ymm0 = _mm256_hadd_epi16(ymm0, ymm1);  // finish dot products: U | V
+    ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
+    ymm0 = _mm256_sub_epi16(ymm_8000, ymm0);  // bias: 0x8000 - sum
+    ymm0 = _mm256_srli_epi16(ymm0, 8);        // keep the high byte
+    ymm0 = _mm256_packus_epi16(ymm0, ymm0);
+
+    __m128i xmm_u = _mm256_castsi256_si128(ymm0);
+    __m128i xmm_v = _mm256_extracti128_si256(ymm0, 1);
+
+    _mm_storel_epi64((__m128i*)dst_u, xmm_u);  // store 8 pixels U.
+    _mm_storel_epi64((__m128i*)dst_v, xmm_v);  // store 8 pixels V.
+
+    src_argb += 64;
+    dst_u += 8;
+    dst_v += 8;
+    width -= 16;
+  }
+}
+#endif
+
+#ifdef HAS_MERGEUVROW_AVX2
+LIBYUV_TARGET_AVX2
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
+                     int width) {
+  while (width > 0) {
+    // Widen 16 U and 16 V bytes to 16-bit lanes, then OR V into the high
+    // byte of each lane to interleave: u0 v0 u1 v1 ...
+    __m256i ymm0 =
+        _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
+    __m256i ymm1 =
+        _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
+
+    ymm1 = _mm256_slli_epi16(ymm1, 8);
+    ymm0 = _mm256_or_si256(ymm0, ymm1);
+
+    _mm256_storeu_si256((__m256i*)dst_uv, ymm0);
+
+    src_u += 16;
+    src_v += 16;
+    dst_uv += 32;
+    width -= 16;
+  }
+}
+#endif
+
 #endif
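For completeness, the path exercised by the RAWToNV21_Opt benchmark is
reachable through the public one-shot conversion. A minimal caller, assuming
the usual libyuv convert.h signature and tightly packed planes (even width
and height for simplicity):

  #include "libyuv/convert.h"

  // dst_y: width * height bytes; dst_vu: width * height / 2 bytes (VU pairs).
  int ConvertFrame(const uint8_t* raw, int width, int height,
                   uint8_t* dst_y, uint8_t* dst_vu) {
    return libyuv::RAWToNV21(raw, width * 3,  // RAW is 3 bytes per pixel
                             dst_y, width,    // Y plane, stride == width
                             dst_vu, width,   // interleaved VU plane
                             width, height);  // 0 on success
  }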