diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9ee8af68f..d8fe2137d 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -398,6 +398,7 @@ extern "C" { #define HAS_ARGBTOUVJ444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON +// TODO: Fix ARGBTOYROW and verify that the ARGBToI444 tests pass. #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON #define HAS_AYUVTOUVROW_NEON diff --git a/source/row_neon.cc b/source/row_neon.cc index d5e36cdef..49d7584dc 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1841,18 +1841,22 @@ static void ARGBToUV444MatrixRow_NEON( "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient + "vneg.s8 d25, d25 \n" + "vneg.s8 d26, d26 \n" + "vneg.s8 d27, d27 \n" + "vneg.s8 d28, d28 \n" "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. 
"vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R "vmull.u8 q3, d2, d24 \n" // R - "vmlal.u8 q3, d1, d28 \n" // G - "vmlal.u8 q3, d0, d27 \n" // B + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B "vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned "vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 8ef9d4975..c30ef680c 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2729,19 +2729,23 @@ static void ARGBToUV444MatrixRow_NEON( "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient + "neg v25.16b, v25.16b \n" + "neg v26.16b, v26.16b \n" + "neg v27.16b, v27.16b \n" + "neg v28.16b, v28.16b \n" "movi v29.16b, #0x80 \n" // 128.5 "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R "prfm pldl1keep, [%0, 448] \n" "umull v3.8h, v2.8b, v24.8b \n" // R - "umlal v3.8h, v1.8b, v28.8b \n" // G - "umlal v3.8h, v0.8b, v27.8b \n" // B + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned