Fix ARGBToUV444Row_NEON

- The coefficients passed in are signed (negative) and must be negated to positive values before use in the unsigned multiply-subtract instructions (vmlsl/umlsl).

Bug: 394127527
Change-Id: I531e475d2ddd4583922d4abef13b9282d002dd7a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6226854
Reviewed-by: Ben Weiss <bweiss@google.com>
This commit is contained in:
Frank Barchard 2025-02-03 13:16:38 -08:00
parent 96f98f6915
commit b3fd3f3f3b
3 changed files with 17 additions and 8 deletions

View File

@ -398,6 +398,7 @@ extern "C" {
#define HAS_ARGBTOUVJ444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON
// TODO: Fix ARGBTOYROW and verify that the ARGBToI444 tests pass.
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
#define HAS_AYUVTOUVROW_NEON

View File

@ -1841,18 +1841,22 @@ static void ARGBToUV444MatrixRow_NEON(
"vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient
"vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient
"vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient
"vneg.s8 d25, d25 \n"
"vneg.s8 d26, d26 \n"
"vneg.s8 d27, d27 \n"
"vneg.s8 d28, d28 \n"
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R
"vmlsl.u8 q2, d1, d25 \n" // G
"vmlsl.u8 q2, d2, d26 \n" // R
"vmull.u8 q3, d2, d24 \n" // R
"vmlal.u8 q3, d1, d28 \n" // G
"vmlal.u8 q3, d0, d27 \n" // B
"vmlsl.u8 q3, d1, d28 \n" // G
"vmlsl.u8 q3, d0, d27 \n" // B
"vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned
"vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned

View File

@ -2729,19 +2729,23 @@ static void ARGBToUV444MatrixRow_NEON(
"dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient
"dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient
"dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient
"neg v25.16b, v25.16b \n"
"neg v26.16b, v26.16b \n"
"neg v27.16b, v27.16b \n"
"neg v28.16b, v28.16b \n"
"movi v29.16b, #0x80 \n" // 128.5
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R
"umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R
"prfm pldl1keep, [%0, 448] \n"
"umull v3.8h, v2.8b, v24.8b \n" // R
"umlal v3.8h, v1.8b, v28.8b \n" // G
"umlal v3.8h, v0.8b, v27.8b \n" // B
"umlsl v3.8h, v1.8b, v28.8b \n" // G
"umlsl v3.8h, v0.8b, v27.8b \n" // B
"addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned
"addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned