RAWToNV21 using SME, SVE, I8MM or Neon

Pixel 9 Now SVE2 2 pass LibYUVConvertTest.RAWToNV21_Opt (364 ms)
 31.76% libyuv::ARGBToUVMatrixRow_SVE_SC()
 30.38% RAWToARGBRow_SVE2
 26.81% ARGBToYMatrixRow_NEON_DotProd
  3.26% MergeUVRow_NEON

Was NEON 1 pass LibYUVConvertTest.RAWToJNV21_Opt (295 ms)
 44.14% RAWToYJRow_NEON
 41.91% RAWToUVJRow_NEON
  5.11% MergeUVRow_NEON

On Intel Skylake: clang [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (301 ms);
Visual C (row_win) [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (2056 ms)

With the new code: clang [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (275 ms);
Visual C [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (365 ms)

Bug: libyuv:42280902
Change-Id: Iaba558ebe96ce6b9881ee9335ba72b8aac390cde
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7802432
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
This commit is contained in:
Frank Barchard 2026-04-29 12:54:49 -07:00 committed by libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com
parent b438739c8b
commit f2ac6db694
11 changed files with 407 additions and 27 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1933
Version: 1934
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -140,6 +140,13 @@ extern "C" {
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || defined(__i386__) || \
defined(_M_X64) || defined(_M_X86))
#define HAS_ARGBTOUVMATRIXROW_AVX2
#define HAS_MERGEUVROW_AVX2
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
defined(GCC_HAS_AVX2))
@ -163,7 +170,6 @@ extern "C" {
#define HAS_I444TORGB24ROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV12TORGB24ROW_AVX2
@ -427,6 +433,7 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_NEON
#define HAS_ARGBTOUVJ444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVMATRIXROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#if !defined(__aarch64__)
@ -573,6 +580,7 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_NEON_I8MM
#define HAS_ARGBTOUVJ444ROW_NEON_I8MM
#define HAS_ARGBTOUVJROW_NEON_I8MM
#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUVROW_NEON_I8MM
#define HAS_BGRATOUVROW_NEON_I8MM
#define HAS_RGBATOUVROW_NEON_I8MM
@ -588,6 +596,7 @@ extern "C" {
#define HAS_ARGBTORGB565DITHERROW_SVE2
#define HAS_ARGBTORGB565ROW_SVE2
#define HAS_ARGBTOUVJROW_SVE2
#define HAS_ARGBTOUVMATRIXROW_SVE2
#define HAS_ARGBTOUVROW_SVE2
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
@ -639,6 +648,7 @@ extern "C" {
#define HAS_ABGRTOUVROW_SME
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_ARGBTOUVJROW_SME
#define HAS_ARGBTOUVMATRIXROW_SME
#define HAS_ARGBTOUVROW_SME
#define HAS_BGRATOUVROW_SME
#define HAS_CONVERT16TO8ROW_SME
@ -1834,6 +1844,43 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
// ARGBToUVMatrixRow family: 2x2 box-subsample two rows of ARGB and convert
// to U and V planes, using a caller-supplied coefficient set read from
// struct ArgbConstants (kRGBToU / kRGBToV). The _Any_ variants wrap the SIMD
// kernels to handle widths that are not a multiple of the kernel step
// (see the ANY12MS instantiations in row_any.cc).
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width,
                            const struct ArgbConstants* c);
void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb,
                                int src_stride_argb,
                                uint8_t* dst_u,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c);
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                 int src_stride_argb,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width,
                                 const struct ArgbConstants* c);
void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb,
                                     int src_stride_argb,
                                     uint8_t* dst_u,
                                     uint8_t* dst_v,
                                     int width,
                                     const struct ArgbConstants* c);
// NOTE(review): the SVE2/SME kernels process any width internally, so no
// _Any_ wrapper is declared for them — confirm against the dispatch code.
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width,
                            const struct ArgbConstants* c);
void ARGBToUVMatrixRow_SME(const uint8_t* src_argb,
                           int src_stride_argb,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width,
                           const struct ArgbConstants* c);
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1933
#define LIBYUV_VERSION 1934
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -2245,6 +2245,32 @@ ARGBToUVMatrixRow_C;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;

View File

@ -574,6 +574,32 @@ ARGBToUVMatrixRow_C;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@ -915,6 +941,32 @@ ARGBToUVMatrixRow_C;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@ -4065,13 +4117,6 @@ int ARGBToAB64(const uint8_t* src_argb,
return 0;
}
// Enabled if 1 pass is available
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
#define HAS_RAWTOYJROW
#endif
// RAW to JNV21 full range NV21
LIBYUV_API
// Convert RAW to NV21 with Matrix.
LIBYUV_API
int RAWToNV21Matrix(const uint8_t* src_raw,
@ -4226,6 +4271,32 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
RAWToARGBRow = RAWToARGBRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;

View File

@ -2264,6 +2264,12 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \
}
#ifdef HAS_ARGBTOUVMATRIXROW_NEON
ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
#endif

View File

@ -1918,6 +1918,72 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// 2x2 box-subsample two rows of ARGB and convert to U and V using runtime
// coefficients loaded from |c| (kRGBToU / kRGBToV, signed 8-bit fixed point).
// Processes 16 pixels (producing 8 U and 8 V bytes) per loop iteration;
// widths that are not a multiple of 16 go through ARGBToUVMatrixRow_Any_NEON.
// NOTE(review): the final "0x8000 - dot" step implies the coefficients are
// stored pre-negated (matching the AArch64 version's -BU/-GU/-RU naming) —
// confirm against the ArgbConstants definitions.
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width,
                            const struct ArgbConstants* c) {
  asm volatile (
      "add %1, %0, %1 \n" // %1 = src_argb + stride (second row)
      "vld1.8 {d18}, [%5] \n" // load kRGBToU
      "vld1.8 {d19}, [%6] \n" // load kRGBToV
      // Widen the s8 coefficients to s16. vmovl reads the whole source before
      // writing, so q9 (= d18:d19) overlapping source d19 is safe.
      "vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17)
      "vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19)
      // Splat each coefficient across a full vector for vmul/vmla.
      "vdup.16 q10, d16[0] \n" // U0
      "vdup.16 q11, d16[1] \n" // U1
      "vdup.16 q12, d16[2] \n" // U2
      "vdup.16 q13, d18[0] \n" // V0
      "vdup.16 q14, d18[1] \n" // V1
      "vdup.16 q15, d18[2] \n" // V2
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
      "subs %4, %4, #16 \n" // 16 processed per loop.
      // Pairwise-add adjacent pixels of the top row per channel.
      "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
      // Accumulate the bottom row: each lane now holds a 2x2 box sum.
      "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
      "vrshr.u16 q0, q0, #2 \n" // rounding average of 4
      "vrshr.u16 q1, q1, #2 \n"
      "vrshr.u16 q2, q2, #2 \n"
      "vmov.u16 q3, #0x8000 \n" // 128 in 8.8 fixed point
      "vmul.s16 q8, q0, q10 \n" // U = B * U0
      "vmla.s16 q8, q1, q11 \n" // U += G * U1
      "vmla.s16 q8, q2, q12 \n" // U += R * U2
      "vmul.s16 q9, q0, q13 \n" // V = B * V0
      "vmla.s16 q9, q1, q14 \n" // V += G * V1
      "vmla.s16 q9, q2, q15 \n" // V += R * V2
      "vsub.u16 q8, q3, q8 \n" // 0x8000 - U
      "vsub.u16 q9, q3, q9 \n" // 0x8000 - V
      // Narrow the 8.8 fixed-point result to bytes with saturation.
      "vqshrn.u16 d0, q8, #8 \n" // Saturating shift right
      "vqshrn.u16 d1, q9, #8 \n"
      "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb), // %0
        "+r"(src_stride_argb), // %1
        "+r"(dst_u), // %2
        "+r"(dst_v), // %3
        "+r"(width) // %4
      : "r"(&c->kRGBToU), // %5
        "r"(&c->kRGBToV) // %6
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
#include "libyuv/convert_from_argb.h"
#ifdef __cplusplus
namespace libyuv {
@ -2893,14 +2894,26 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
void ARGBToUVRow_NEON(const uint8_t* src_argb,
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int width,
const struct ArgbConstants* c) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
"ldr q16, [%[c], #16] \n" // kRGBToU
"ldr q17, [%[c], #32] \n" // kRGBToV
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
"dup v20.8h, v16.h[0] \n" // U0 (-BU)
"dup v21.8h, v16.h[1] \n" // U1 (-GU)
"dup v22.8h, v16.h[2] \n" // U2 (-RU)
"dup v23.8h, v17.h[0] \n" // V0 (-BV)
"dup v24.8h, v17.h[1] \n" // V1 (-GV)
"dup v26.8h, v17.h[2] \n" // V2 (-RV)
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
@ -2909,7 +2922,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@ -2919,7 +2932,20 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
RGBTOUV(v0.8h, v1.8h, v2.8h)
// U = B*U0 + G*U1 + R*U2
"mul v3.8h, v0.8h, v20.8h \n"
"mla v3.8h, v1.8h, v21.8h \n"
"mla v3.8h, v2.8h, v22.8h \n"
// V = B*V0 + G*V1 + R*V2
"mul v4.8h, v0.8h, v23.8h \n"
"mla v4.8h, v1.8h, v24.8h \n"
"mla v4.8h, v2.8h, v26.8h \n"
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
"subhn v0.8b, v25.8h, v3.8h \n"
"subhn v1.8b, v25.8h, v4.8h \n"
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
@ -2928,12 +2954,21 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: [c] "r"(c) // %5
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
);
}
// Fixed-coefficient entry point: forwards to the matrix row with
// kArgbI601Constants (BT.601 coefficients, per the constant's name).
void ARGBToUVRow_NEON(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
                         &kArgbI601Constants);
}
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -3449,7 +3484,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
}
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
static void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src,
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
@ -3546,12 +3581,25 @@ static const int8_t kRGBAToUVCoefficients[] = {
0, -112, 74, 38, 0, 18, 94, -112,
};
// Adapt the ArgbConstants layout to the 8-byte coefficient layout expected
// by the I8MM kernel: 4 U coefficients followed by 4 V coefficients.
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                 int src_stride_argb,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width,
                                 const struct ArgbConstants* c) {
  int8_t coeffs[8];
  for (int i = 0; i < 4; ++i) {
    coeffs[i] = (int8_t)c->kRGBToU[i];      // U coefficients first
    coeffs[i + 4] = (int8_t)c->kRGBToV[i];  // then V coefficients
  }
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
                                   width, coeffs);
}
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
}
@ -3560,7 +3608,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
}
@ -3569,7 +3617,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
}
@ -3578,7 +3626,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
}
@ -3606,7 +3654,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients);
}
@ -3615,7 +3663,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVJCoefficients);
}

View File

@ -1120,6 +1120,20 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
}
// SME entry point: repack the U and V coefficient vectors from ArgbConstants
// into the 8-byte layout used by the shared streaming-compatible SVE kernel.
__arm_locally_streaming void ARGBToUVMatrixRow_SME(
    const uint8_t* src_argb,
    int src_stride_argb,
    uint8_t* dst_u,
    uint8_t* dst_v,
    int width,
    const struct ArgbConstants* c) {
  int8_t coeffs[8];
  for (int i = 0; i < 4; ++i) {
    coeffs[i] = (int8_t)c->kRGBToU[i];      // U coefficients first
    coeffs[i + 4] = (int8_t)c->kRGBToV[i];  // then V coefficients
  }
  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                           coeffs);
}
__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -217,6 +217,19 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
// SVE2 entry point: repack the U and V coefficient vectors from
// ArgbConstants into the 8-byte layout used by the shared SVE kernel.
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width,
                            const struct ArgbConstants* c) {
  int8_t coeffs[8];
  for (int i = 0; i < 4; ++i) {
    coeffs[i] = (int8_t)c->kRGBToU[i];      // U coefficients first
    coeffs[i + 4] = (int8_t)c->kRGBToV[i];  // then V coefficients
  }
  ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
                           coeffs);
}
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -314,6 +314,95 @@ void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int wi
}
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
// 2x2 box-subsample two rows of ARGB and convert 16 pixels per iteration to
// 8 U and 8 V bytes, using runtime coefficients from |c|.
// width is expected to be a positive multiple of 16; other widths go through
// ARGBToUVMatrixRow_Any_AVX2.
LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width,
                            const struct ArgbConstants* c) {
  // Broadcast the 16-byte U and V coefficient vectors to both 128-bit lanes.
  __m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
  __m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
  __m256i ymm_0101 = _mm256_set1_epi16(0x0101);  // maddubs weights of (1, 1)
  // Per-lane shuffle that groups each channel's bytes from two adjacent
  // pixels: B0 B1 G0 G1 R0 R1 A0 A1 | B2 B3 G2 G3 R2 R3 A2 A3.
  __m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
                                      0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
  __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);  // 128 in 8.8 fixed point
  __m256i ymm_zero = _mm256_setzero_si256();
  while (width > 0) {
    // Load 16 pixels from each of the two rows (2 x 32 bytes per row).
    __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
    __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
    __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
    __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
    ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
    ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
    ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
    ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf);
    // Sum adjacent pixel pairs per channel into 16-bit lanes.
    ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101);
    ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101);
    ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101);
    ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101);
    // Add the two rows: each lane is now a 2x2 box sum (max 1020, no overflow).
    ymm0 = _mm256_add_epi16(ymm0, ymm2);
    ymm1 = _mm256_add_epi16(ymm1, ymm3);
    // (sum >> 1), then avg with zero = (+1) >> 1: rounding average of 4.
    ymm0 = _mm256_srli_epi16(ymm0, 1);
    ymm1 = _mm256_srli_epi16(ymm1, 1);
    ymm0 = _mm256_avg_epu16(ymm0, ymm_zero);
    ymm1 = _mm256_avg_epu16(ymm1, ymm_zero);
    // Pack averaged channels back to bytes; permute undoes packus's
    // per-lane interleave so pixel order is restored.
    ymm0 = _mm256_packus_epi16(ymm0, ymm1);
    ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
    // Dot products with the coefficient vectors: maddubs forms byte-pair
    // partial sums, hadd completes the 4-term dot and pairs U with V.
    ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v);
    ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u);
    ymm0 = _mm256_hadd_epi16(ymm0, ymm1);
    ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
    // Bias and narrow: (0x8000 - dot) >> 8, matching the NEON path.
    ymm0 = _mm256_sub_epi16(ymm_8000, ymm0);
    ymm0 = _mm256_srli_epi16(ymm0, 8);
    ymm0 = _mm256_packus_epi16(ymm0, ymm0);
    // Low 128-bit lane holds U, high lane holds V; store 8 bytes of each.
    __m128i xmm_u = _mm256_castsi256_si128(ymm0);
    __m128i xmm_v = _mm256_extracti128_si256(ymm0, 1);
    _mm_storel_epi64((__m128i*)dst_u, xmm_u);
    _mm_storel_epi64((__m128i*)dst_v, xmm_v);
    src_argb += 64;
    dst_u += 8;
    dst_v += 8;
    width -= 16;
  }
}
#endif
#ifdef HAS_MERGEUVROW_AVX2
LIBYUV_TARGET_AVX2
// Interleave 16 U bytes and 16 V bytes into 32 UV bytes (U in the low byte
// of each pair) per iteration. width is consumed in steps of 16.
void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  for (; width > 0; width -= 16) {
    // Zero-extend each byte to 16 bits: U lands in the low byte of each
    // 16-bit lane, V is shifted into the high byte, and OR interleaves them.
    __m256i u16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
    __m256i v16 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
    __m256i uv = _mm256_or_si256(u16, _mm256_slli_epi16(v16, 8));
    _mm256_storeu_si256((__m256i*)dst_uv, uv);
    src_u += 16;
    src_v += 16;
    dst_uv += 32;
  }
}
#endif
#endif