ARGBToUVMatrixRow_NEON arm32 reimplemented for GCC

Bug: libyuv:508639302
Change-Id: Ib120373d799c66926a64c980873034be262d8848
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7810481
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
Frank Barchard 2026-05-04 09:50:57 -07:00
parent f2ac6db694
commit 2143edfa7a
6 changed files with 73 additions and 15 deletions

View File

@ -433,7 +433,9 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_NEON
#define HAS_ARGBTOUVJ444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#if !defined(__GNUC__) || defined(__clang__)
#define HAS_ARGBTOUVMATRIXROW_NEON
#endif
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#if !defined(__aarch64__)

View File

@ -2263,12 +2263,16 @@ ARGBToUVMatrixRow_C;
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)

View File

@ -592,12 +592,16 @@ ARGBToUVMatrixRow_C;
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
@ -959,12 +963,16 @@ ARGBToUVMatrixRow_C;
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
@ -4289,12 +4297,16 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
if (IS_ALIGNED(width, 2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)

View File

@ -2201,6 +2201,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
SIMD_ALIGNED(uint8_t vin[256 * 2]); \
SIMD_ALIGNED(uint8_t vout[256 * 2]); \
memset(vin, 0, sizeof(vin)); /* for msan */ \
memset(vout, 0, sizeof(vout)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
@ -2244,6 +2245,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
SIMD_ALIGNED(uint8_t vin[256 * 2]); \
SIMD_ALIGNED(uint8_t vout[256 * 2]); \
memset(vin, 0, sizeof(vin)); /* for msan */ \
memset(vout, 0, sizeof(vout)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \

View File

@ -1924,8 +1924,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vld1.8 {d18}, [%5] \n" // load kRGBToU
"vld1.8 {d19}, [%6] \n" // load kRGBToV
"vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17)
@ -1936,6 +1936,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"vdup.16 q13, d18[0] \n" // V0
"vdup.16 q14, d18[1] \n" // V1
"vdup.16 q15, d18[2] \n" // V2
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
@ -1963,17 +1964,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"vmla.s16 q9, q1, q14 \n" // V += G * V1
"vmla.s16 q9, q2, q15 \n" // V += R * V2
"vsub.u16 q8, q3, q8 \n" // 128.0 - U
"vsub.u16 q9, q3, q9 \n" // 128.0 - V
"vqshrn.u16 d0, q8, #8 \n" // Saturating shift right
"vqshrn.u16 d1, q9, #8 \n"
"vsubhn.s16 d0, q3, q8 \n" // 128.0 - U
"vsubhn.s16 d1, q3, q9 \n" // 128.0 - V
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4

View File

@ -2809,6 +2809,46 @@ TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
}
#endif
#ifdef ENABLE_ROW_TESTS
// Checks that ARGBToUVMatrixRow_Any_NEON produces the same U and V bytes as
// ARGBToUVMatrixRow_C with kArgbI601Constants. Every width from 1 to
// kMaxWidth is exercised, once with a source stride of 0 ("height 1") and
// once with a stride of one full row ("height 2"). Only the first
// (width + 1) / 2 output bytes of each plane are compared.
TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) {
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
  // All setup lives inside the guard: when the NEON row function is not
  // compiled in, nothing below would be used, and leaving the buffers outside
  // would produce unused locals on those builds.
  const int kMaxWidth = 128;
  SIMD_ALIGNED(uint8_t orig_argb_pixels[kMaxWidth * 4 * 2]);
  SIMD_ALIGNED(uint8_t dest_u_c[kMaxWidth]);
  SIMD_ALIGNED(uint8_t dest_v_c[kMaxWidth]);
  SIMD_ALIGNED(uint8_t dest_u_opt[kMaxWidth]);
  SIMD_ALIGNED(uint8_t dest_v_opt[kMaxWidth]);

  // Deterministic fill covering two rows of kMaxWidth ARGB pixels; the value
  // wraps modulo 256 when stored into a byte.
  for (int i = 0; i < kMaxWidth * 4 * 2; ++i) {
    orig_argb_pixels[i] = static_cast<uint8_t>(i * 43);
  }

  // Without NEON at runtime there is nothing to compare against.
  if (!TestCpuFlag(kCpuHasNEON)) {
    return;
  }

  for (int width = 1; width <= kMaxWidth; ++width) {
    // Number of U (and V) bytes compared for this width; odd widths round up.
    const int half_width = (width + 1) / 2;
    for (int height = 1; height <= 2; ++height) {
      // Clear all outputs so a stale byte from a previous iteration cannot
      // mask a mismatch.
      memset(dest_u_c, 0, sizeof(dest_u_c));
      memset(dest_v_c, 0, sizeof(dest_v_c));
      memset(dest_u_opt, 0, sizeof(dest_u_opt));
      memset(dest_v_opt, 0, sizeof(dest_v_opt));

      // Stride 0 passes the same row as both source rows; otherwise the
      // second row starts kMaxWidth pixels (4 bytes each) further on.
      const int src_stride = (height == 1) ? 0 : kMaxWidth * 4;

      ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0],
                          &dest_v_c[0], width, &kArgbI601Constants);
      ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride,
                                 &dest_u_opt[0], &dest_v_opt[0], width,
                                 &kArgbI601Constants);

      for (int i = 0; i < half_width; ++i) {
        EXPECT_EQ(dest_u_c[i], dest_u_opt[i])
            << "u mismatch at " << i << " width " << width << " height "
            << height;
        EXPECT_EQ(dest_v_c[i], dest_v_opt[i])
            << "v mismatch at " << i << " width " << width << " height "
            << height;
      }
    }
  }
#endif
}
#endif
#if !defined(DISABLE_SLOW_TESTS) && \
(defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__))
// TODO(fbarchard): Consider _set_new_mode(0) to make malloc return NULL