[AArch64] Add SME implementation of ARGBToUVRow and similar

Mostly just a straightforward copy of the existing SVE2 code, ported to
Streaming-SVE. Introduce new "any" kernels for non-multiple-of-two widths,
matching what we already do for SVE2.

The existing SVE2 code makes use of the Neon MOVI instruction, which is not
supported in Streaming-SVE, so adjust the code to use FMOV instead, which
has the same performance characteristics.
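
As a rough illustration of the substitution (not code from this patch; the
register choice and helper name are made up), the streaming-compatible way to
zero a vector accumulator goes through the scalar FP register file, since a
write to an S register also clears the rest of the corresponding Z register:

  // Hypothetical sketch: zero z16 without the Neon MOVI instruction,
  // which is not available in streaming mode.
  static inline void ZeroZ16_StreamingCompatible(void) {
    asm volatile("fmov s16, wzr" : : : "v16");
  }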

Change-Id: I74b7ea1fe8e6af75dfaf92826a4de775a1559f77
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6663806
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Author: George Steed, 2025-06-13 09:13:02 +01:00 (committed by Frank Barchard)
parent 9519b7df0e
commit 007b920232
7 changed files with 451 additions and 184 deletions


@@ -615,7 +615,12 @@ extern "C" {
// The following are available on AArch64 SME platforms:
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
#define HAS_ABGRTOUVJROW_SME
#define HAS_ABGRTOUVROW_SME
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_ARGBTOUVJROW_SME
#define HAS_ARGBTOUVROW_SME
#define HAS_BGRATOUVROW_SME
#define HAS_CONVERT16TO8ROW_SME
#define HAS_CONVERT8TO16ROW_SME
#define HAS_CONVERT8TO8ROW_SME
@@ -654,6 +659,7 @@ extern "C" {
#define HAS_P210TOARGBROW_SME
#define HAS_P410TOAR30ROW_SME
#define HAS_P410TOARGBROW_SME
#define HAS_RGBATOUVROW_SME
#define HAS_YUY2TOARGBROW_SME
#endif
@@ -1954,6 +1960,11 @@ void ARGBToUVRow_SVE2(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUV444Row_MSA(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -1996,6 +2007,11 @@ void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_uj,
@@ -2011,6 +2027,11 @@ void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
uint8_t* dst_uj,
uint8_t* dst_vj,
int width);
void ABGRToUVJRow_SME(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_uj,
uint8_t* dst_vj,
int width);
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
@@ -2026,6 +2047,11 @@ void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_SME(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
@@ -2041,6 +2067,11 @@ void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_SME(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
@@ -2056,6 +2087,11 @@ void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_SME(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_u,
@@ -2503,6 +2539,11 @@ void ARGBToUVRow_Any_SVE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_SME(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@@ -2545,6 +2586,11 @@ void ARGBToUVJRow_Any_SVE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_SME(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
@@ -2560,6 +2606,11 @@ void ABGRToUVJRow_Any_SVE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVJRow_Any_SME(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
@@ -2575,6 +2626,11 @@ void BGRAToUVRow_Any_SVE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_SME(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
@@ -2590,6 +2646,11 @@ void ABGRToUVRow_Any_SVE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_SME(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
@@ -2605,6 +2666,11 @@ void RGBAToUVRow_Any_SVE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_SME(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,


@@ -1971,6 +1971,179 @@ static inline void Convert8To8Row_SVE_SC(const uint8_t* src_y,
: "cc", "memory", "z0", "z1", "z2", "z3", "p0", "p1");
}
// SVE constants are stored negated such that we can store 128 in int8_t.
// RGB to BT601 coefficients
// UB 0.875 coefficient = 112
// UG -0.5781 coefficient = -74
// UR -0.2969 coefficient = -38
// VB -0.1406 coefficient = -18
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
static const int8_t kARGBToUVCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-112, 74, 38, 0, 18, 94, -112, 0,
};
static const int8_t kABGRToUVCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
38, 74, -112, 0, -112, 94, 18, 0,
};
static const int8_t kBGRAToUVCoefficients[] = {
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
0, 38, 74, -112, 0, -112, 94, 18,
};
static const int8_t kRGBAToUVCoefficients[] = {
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
0, -112, 74, 38, 0, 18, 94, -112,
};
// RGB to JPEG coefficients
// UB 0.500 coefficient = 128
// UG -0.33126 coefficient = -85
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -21
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
static const int8_t kARGBToUVJCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-128, 85, 43, 0, 21, 107, -128, 0,
};
static const int8_t kABGRToUVJCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
43, 85, -128, 0, -128, 107, 21, 0,
};
#define ABCDTOUVMATRIX_SVE \
"ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
"ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \
"ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \
"ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \
"incb %[src0], all, mul #4 \n" \
"incb %[src1], all, mul #4 \n" \
\
"uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \
"uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \
"uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \
"uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \
"uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \
"uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \
"uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \
"uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \
\
/* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \
"addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \
"addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \
"addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \
"addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \
\
"rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \
"rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \
"rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \
"rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \
\
"tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \
"tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \
\
"subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \
\
"fmov s16, wzr \n" \
"fmov s17, wzr \n" \
"fmov s20, wzr \n" \
"fmov s21, wzr \n" \
\
"usdot z16.s, z0.b, z24.b \n" \
"usdot z17.s, z1.b, z24.b \n" \
"usdot z20.s, z0.b, z25.b \n" \
"usdot z21.s, z1.b, z25.b \n" \
\
"subhnb z16.b, z26.h, z16.h \n" /* U */ \
"subhnb z20.b, z26.h, z20.h \n" /* V */ \
"subhnb z17.b, z26.h, z17.h \n" /* U */ \
"subhnb z21.b, z26.h, z21.h \n" /* V */ \
\
"uzp1 z16.h, z16.h, z17.h \n" \
"uzp1 z20.h, z20.h, z21.h \n" \
\
"st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \
"st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \
"inch %[dst_u] \n" \
"inch %[dst_v] \n"
static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const int8_t* uvconstants)
STREAMING_COMPATIBLE {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl;
asm("cntd %x0" : "=r"(vl));
// Width is a multiple of two here, so halve it.
width >>= 1;
asm volatile(
"ptrue p0.b \n"
"ld1rw {z24.s}, p0/z, [%[uvconstants]] \n"
"ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n"
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
// Generate some TBL indices to undo the interleaving from ADDP.
"index z0.s, #0, #1 \n"
"index z1.s, #1, #1 \n"
"uzp1 z27.s, z0.s, z1.s \n"
"subs %w[width], %w[width], %w[vl], lsl #2 \n"
"b.lt 2f \n"
"ptrue p1.d \n"
"ptrue p2.d \n"
"ptrue p3.d \n"
"ptrue p4.d \n"
"ptrue p5.h \n"
"1: \n" //
ABCDTOUVMATRIX_SVE
"b.gt 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl], lsl #2 \n"
"b.eq 99f \n"
"3: \n"
"whilelt p1.d, wzr, %w[width] \n"
"whilelt p2.d, %w[vl], %w[width] \n"
"whilelt p3.d, %w[vl2], %w[width] \n"
"whilelt p4.d, %w[vl3], %w[width] \n"
"whilelt p5.h, wzr, %w[width] \n" //
ABCDTOUVMATRIX_SVE
"b.gt 3b \n"
"99: \n"
: [src0] "+r"(src_argb), // %[src0]
[src1] "+r"(src_argb_1), // %[src1]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [uvconstants] "r"(uvconstants), // %[uvconstants]
[vl] "r"(vl), // %[vl]
[vl2] "r"(vl * 2), // %[vl2]
[vl3] "r"(vl * 3) // %[vl3]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
"z27", "p0", "p1", "p2", "p3", "p4", "p5");
}
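
As an aside, the WHILELT predicates above can be modelled with a few lines of
scalar code (illustration only; the helper name is invented): a predicate
built with base 0, vl, 2*vl or 3*vl enables only the 64-bit lanes, i.e. pixel
pairs, whose global index is still below the remaining width.

  // Model of "whilelt pX.d, base, remaining": lane i is active iff
  // base + i < remaining, so the final partial iteration loads and stores
  // exactly the pixel pairs that are left.
  static int ActiveLanes_Model(int base, int remaining, int vl) {
    int active = remaining - base;
    if (active < 0) active = 0;
    if (active > vl) active = vl;
    return active;
  }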
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus


@@ -2103,6 +2103,14 @@ int ARGBToI420(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVRow = ARGBToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVRow = ARGBToUVRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2289,6 +2297,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVRow = ARGBToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVRow = ARGBToUVRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -2476,6 +2492,14 @@ int BGRAToI420(const uint8_t* src_bgra,
}
}
#endif
#if defined(HAS_BGRATOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
BGRAToUVRow = BGRAToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
BGRAToUVRow = BGRAToUVRow_SME;
}
}
#endif
#if defined(HAS_BGRATOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
BGRAToYRow = BGRAToYRow_Any_SSSE3;
@@ -2659,6 +2683,14 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ABGRToUVRow = ABGRToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ABGRToUVRow = ABGRToUVRow_SME;
}
}
#endif
#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -2792,6 +2824,14 @@ int RGBAToI420(const uint8_t* src_rgba,
}
}
#endif
#if defined(HAS_RGBATOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
RGBAToUVRow = RGBAToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
RGBAToUVRow = RGBAToUVRow_SME;
}
}
#endif
#if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGBAToYRow = RGBAToYRow_Any_MSA;


@@ -287,6 +287,14 @@ int ARGBToI422(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVRow = ARGBToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVRow = ARGBToUVRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -411,6 +419,14 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVRow = ARGBToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVRow = ARGBToUVRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -662,6 +678,14 @@ int ARGBToNV21(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVRow = ARGBToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVRow = ARGBToUVRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -890,6 +914,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ABGRToUVRow = ABGRToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ABGRToUVRow = ABGRToUVRow_SME;
}
}
#endif
#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -1107,6 +1139,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ABGRToUVRow = ABGRToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ABGRToUVRow = ABGRToUVRow_SME;
}
}
#endif
#if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYRow = ABGRToYRow_Any_MSA;
@@ -1329,6 +1369,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVRow = ARGBToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVRow = ARGBToUVRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -1549,6 +1597,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVRow = ARGBToUVRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVRow = ARGBToUVRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA;
@@ -2697,6 +2753,14 @@ int ARGBToJ420(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVJRow = ARGBToUVJRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@@ -2889,6 +2953,14 @@ int ARGBToJ422(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVJRow = ARGBToUVJRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA;
@@ -3211,6 +3283,14 @@ int ABGRToJ420(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOUVJROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ABGRToUVJRow = ABGRToUVJRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ABGRToUVJRow = ABGRToUVJRow_SME;
}
}
#endif
#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYJRow = ABGRToYJRow_Any_MSA;
@@ -3365,6 +3445,14 @@ int ABGRToJ422(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOUVJROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ABGRToUVJRow = ABGRToUVJRow_Any_SME;
if (IS_ALIGNED(width, 2)) {
ABGRToUVJRow = ABGRToUVJRow_SME;
}
}
#endif
#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYJRow = ABGRToYJRow_Any_MSA;


@@ -2404,6 +2404,9 @@ ANY12S(ARGBToUVRow_Any_NEON_I8MM, ARGBToUVRow_NEON_I8MM, 0, 4, 15)
#ifdef HAS_ARGBTOUVROW_SVE2
ANY12S(ARGBToUVRow_Any_SVE2, ARGBToUVRow_SVE2, 0, 4, 1)
#endif
#ifdef HAS_ARGBTOUVROW_SME
ANY12S(ARGBToUVRow_Any_SME, ARGBToUVRow_SME, 0, 4, 1)
#endif
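
To make the odd-width handling concrete, here is a hedged and much simplified
sketch of what the generated ARGBToUVRow_Any_SME does (the real body is the
ANY12S macro instantiated above; the temporary layout and pixel duplication
below are assumptions for illustration only): the even prefix goes straight to
the vector kernel, and a trailing odd pixel is duplicated so the kernel still
sees a full pixel pair.

  #include <stdint.h>
  #include <string.h>

  void ARGBToUVRow_SME(const uint8_t* src_argb, int src_stride_argb,
                       uint8_t* dst_u, uint8_t* dst_v, int width);

  static void ARGBToUVRow_Any_SME_Sketch(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
    int n = width & ~1;  // even prefix handled directly by the SME kernel
    if (n > 0) {
      ARGBToUVRow_SME(src_argb, src_stride_argb, dst_u, dst_v, n);
    }
    if (width & 1) {
      // Duplicate the last pixel of each row so the kernel still averages a
      // full 2x2 block, then copy out the single U/V bytes it produces.
      uint8_t temp[2][8];  // two rows of one ARGB pixel pair (sizes assumed)
      uint8_t u, v;
      memcpy(&temp[0][0], src_argb + n * 4, 4);
      memcpy(&temp[0][4], src_argb + n * 4, 4);
      memcpy(&temp[1][0], src_argb + src_stride_argb + n * 4, 4);
      memcpy(&temp[1][4], src_argb + src_stride_argb + n * 4, 4);
      ARGBToUVRow_SME(&temp[0][0], 8, &u, &v, 2);
      dst_u[n / 2] = u;
      dst_v[n / 2] = v;
    }
  }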
#ifdef HAS_ARGBTOUVROW_MSA
ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
#endif
@@ -2422,6 +2425,9 @@ ANY12S(ARGBToUVJRow_Any_NEON_I8MM, ARGBToUVJRow_NEON_I8MM, 0, 4, 15)
#ifdef HAS_ARGBTOUVJROW_SVE2
ANY12S(ARGBToUVJRow_Any_SVE2, ARGBToUVJRow_SVE2, 0, 4, 1)
#endif
#ifdef HAS_ARGBTOUVJROW_SME
ANY12S(ARGBToUVJRow_Any_SME, ARGBToUVJRow_SME, 0, 4, 1)
#endif
#ifdef HAS_ABGRTOUVJROW_NEON
ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15)
#endif
@@ -2431,6 +2437,9 @@ ANY12S(ABGRToUVJRow_Any_NEON_I8MM, ABGRToUVJRow_NEON_I8MM, 0, 4, 15)
#ifdef HAS_ABGRTOUVJROW_SVE2
ANY12S(ABGRToUVJRow_Any_SVE2, ABGRToUVJRow_SVE2, 0, 4, 1)
#endif
#ifdef HAS_ABGRTOUVJROW_SME
ANY12S(ABGRToUVJRow_Any_SME, ABGRToUVJRow_SME, 0, 4, 1)
#endif
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
@@ -2449,6 +2458,9 @@ ANY12S(BGRAToUVRow_Any_NEON_I8MM, BGRAToUVRow_NEON_I8MM, 0, 4, 15)
#ifdef HAS_BGRATOUVROW_SVE2
ANY12S(BGRAToUVRow_Any_SVE2, BGRAToUVRow_SVE2, 0, 4, 1)
#endif
#ifdef HAS_BGRATOUVROW_SME
ANY12S(BGRAToUVRow_Any_SME, BGRAToUVRow_SME, 0, 4, 1)
#endif
#ifdef HAS_BGRATOUVROW_MSA
ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
#endif
@@ -2464,6 +2476,9 @@ ANY12S(ABGRToUVRow_Any_NEON_I8MM, ABGRToUVRow_NEON_I8MM, 0, 4, 15)
#ifdef HAS_ABGRTOUVROW_SVE2
ANY12S(ABGRToUVRow_Any_SVE2, ABGRToUVRow_SVE2, 0, 4, 1)
#endif
#ifdef HAS_ABGRTOUVROW_SME
ANY12S(ABGRToUVRow_Any_SME, ABGRToUVRow_SME, 0, 4, 1)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
#endif
@@ -2479,6 +2494,9 @@ ANY12S(RGBAToUVRow_Any_NEON_I8MM, RGBAToUVRow_NEON_I8MM, 0, 4, 15)
#ifdef HAS_RGBATOUVROW_SVE2
ANY12S(RGBAToUVRow_Any_SVE2, RGBAToUVRow_SVE2, 0, 4, 1)
#endif
#ifdef HAS_RGBATOUVROW_SME
ANY12S(RGBAToUVRow_Any_SME, RGBAToUVRow_SME, 0, 4, 1)
#endif
#ifdef HAS_RGBATOUVROW_MSA
ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
#endif


@@ -1120,6 +1120,60 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
}
__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
}
__arm_locally_streaming void ARGBToUVJRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients);
}
__arm_locally_streaming void ABGRToUVJRow_SME(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_uj,
uint8_t* dst_vj,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
kABGRToUVJCoefficients);
}
__arm_locally_streaming void BGRAToUVRow_SME(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
}
__arm_locally_streaming void ABGRToUVRow_SME(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
}
__arm_locally_streaming void RGBAToUVRow_SME(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE_SC(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
}
#endif // !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) &&
// defined(__aarch64__)
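
For context, __arm_locally_streaming on the wrappers above is what lets them
reuse the streaming-compatible ARGBToUVMatrixRow_SVE_SC body: the compiler
switches the PE into streaming mode on entry to the function and back on
return. A minimal hedged illustration of the attribute (assuming an
SME-enabled toolchain; not libyuv code):

  #if defined(__ARM_FEATURE_SME)
  // The body executes in streaming mode, at the streaming vector length,
  // without the caller needing to manage the mode switch itself.
  __arm_locally_streaming void RunInStreamingMode_Example(void) {
    // Streaming-compatible SVE code would go here.
  }
  #endif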


@@ -217,185 +217,13 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
// SVE constants are stored negated such that we can store 128 in int8_t.
// RGB to BT601 coefficients
// UB 0.875 coefficient = 112
// UG -0.5781 coefficient = -74
// UR -0.2969 coefficient = -38
// VB -0.1406 coefficient = -18
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
static const int8_t kARGBToUVCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-112, 74, 38, 0, 18, 94, -112, 0,
};
static const int8_t kABGRToUVCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
38, 74, -112, 0, -112, 94, 18, 0,
};
static const int8_t kBGRAToUVCoefficients[] = {
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
0, 38, 74, -112, 0, -112, 94, 18,
};
static const int8_t kRGBAToUVCoefficients[] = {
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
0, -112, 74, 38, 0, 18, 94, -112,
};
// RGB to JPEG coefficients
// UB 0.500 coefficient = 128
// UG -0.33126 coefficient = -85
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -21
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
static const int8_t kARGBToUVJCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-128, 85, 43, 0, 21, 107, -128, 0,
};
static const int8_t kABGRToUVJCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
43, 85, -128, 0, -128, 107, 21, 0,
};
#define ABCDTOUVMATRIX_SVE \
"ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
"ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \
"ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \
"ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \
"ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \
"incb %[src0], all, mul #4 \n" \
"incb %[src1], all, mul #4 \n" \
\
"uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \
"uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \
"uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \
"uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \
"uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \
"uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \
"uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \
"uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \
\
/* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \
"addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \
"addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \
"addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \
"addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \
\
"rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \
"rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \
"rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \
"rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \
\
"tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \
"tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \
\
"subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \
\
"movi v16.8h, #0 \n" \
"movi v17.8h, #0 \n" \
"movi v20.8h, #0 \n" \
"movi v21.8h, #0 \n" \
\
"usdot z16.s, z0.b, z24.b \n" \
"usdot z17.s, z1.b, z24.b \n" \
"usdot z20.s, z0.b, z25.b \n" \
"usdot z21.s, z1.b, z25.b \n" \
\
"subhnb z16.b, z26.h, z16.h \n" /* U */ \
"subhnb z20.b, z26.h, z20.h \n" /* V */ \
"subhnb z17.b, z26.h, z17.h \n" /* U */ \
"subhnb z21.b, z26.h, z21.h \n" /* V */ \
\
"uzp1 z16.h, z16.h, z17.h \n" \
"uzp1 z20.h, z20.h, z21.h \n" \
\
"st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \
"st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \
"inch %[dst_u] \n" \
"inch %[dst_v] \n"
static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const int8_t* uvconstants) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl;
asm("cntd %x0" : "=r"(vl));
// Width is a multiple of two here, so halve it.
width >>= 1;
asm volatile(
"ptrue p0.b \n"
"ld1rw {z24.s}, p0/z, [%[uvconstants]] \n"
"ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n"
"mov z26.h, #0x8000 \n" // 128.0 (0x8000)
// Generate some TBL indices to undo the interleaving from ADDP.
"index z0.s, #0, #1 \n"
"index z1.s, #1, #1 \n"
"uzp1 z27.s, z0.s, z1.s \n"
"subs %w[width], %w[width], %w[vl], lsl #2 \n"
"b.lt 2f \n"
"ptrue p1.d \n"
"ptrue p2.d \n"
"ptrue p3.d \n"
"ptrue p4.d \n"
"ptrue p5.h \n"
"1: \n" //
ABCDTOUVMATRIX_SVE
"b.gt 1b \n"
"2: \n"
"adds %w[width], %w[width], %w[vl], lsl #2 \n"
"b.eq 99f \n"
"3: \n"
"whilelt p1.d, wzr, %w[width] \n"
"whilelt p2.d, %w[vl], %w[width] \n"
"whilelt p3.d, %w[vl2], %w[width] \n"
"whilelt p4.d, %w[vl3], %w[width] \n"
"whilelt p5.h, wzr, %w[width] \n" //
ABCDTOUVMATRIX_SVE
"b.gt 3b \n"
"99: \n"
: [src0] "+r"(src_argb), // %[src0]
[src1] "+r"(src_argb_1), // %[src1]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [uvconstants] "r"(uvconstants), // %[uvconstants]
[vl] "r"(vl), // %[vl]
[vl2] "r"(vl * 2), // %[vl2]
[vl3] "r"(vl * 3) // %[vl3]
: "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16",
"z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26",
"z27", "p0", "p1", "p2", "p3", "p4", "p5");
}
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
}
void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
@@ -403,8 +231,8 @@ void ARGBToUVJRow_SVE2(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE2(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients);
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients);
}
void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
@@ -412,8 +240,8 @@ void ABGRToUVJRow_SVE2(const uint8_t* src_abgr,
uint8_t* dst_uj,
uint8_t* dst_vj,
int width) {
ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
kABGRToUVJCoefficients);
ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
kABGRToUVJCoefficients);
}
void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
@@ -421,8 +249,8 @@ void BGRAToUVRow_SVE2(const uint8_t* src_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE2(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
ARGBToUVMatrixRow_SVE_SC(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
}
void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
@@ -430,8 +258,8 @@ void ABGRToUVRow_SVE2(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
ARGBToUVMatrixRow_SVE_SC(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
}
void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
@@ -439,8 +267,8 @@ void RGBAToUVRow_SVE2(const uint8_t* src_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_SVE2(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
ARGBToUVMatrixRow_SVE_SC(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
}
#define ARGBTORGB565_SVE \