[AArch64] Add Neon implementation for I422ToAR30Row_NEON

There is an existing x86 implementation for this kernel, but not for AArch64, so add one. Reduction in runtimes, compared to the existing C code compiled with LLVM 17: Cortex-A55: -43.1% Cortex-A510: -22.3% Cortex-A76: -54.8% Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com> Bug: libyuv:976 Change-Id: Ifead36bcb8682a527136223e0dcd210e9abe744a Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607763 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
2026-02-06 09:49:50 +08:00 · 2024-04-10 16:36:26 +01:00 · 2024-04-10 16:36:26 +01:00 · d32436e8f8
commit d32436e8f8
parent bbd9cedc4f
4 changed files with 50 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -563,6 +563,7 @@ extern "C" {
 #define HAS_I410TOAR30ROW_NEON
 #define HAS_I212TOARGBROW_NEON
 #define HAS_I212TOAR30ROW_NEON
+#define HAS_I422TOAR30ROW_NEON

 #define HAS_ABGRTOYJROW_NEON_DOTPROD
 #define HAS_ABGRTOYROW_NEON_DOTPROD
@ -1148,6 +1149,12 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+// Neon row kernel for the I422 to AR30 conversion. The implementation
+// consumes width in steps of 8.
+void I422ToAR30Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
@ -5267,6 +5274,12 @@ void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
                              uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
+// Wrapper around I422ToAR30Row_NEON, instantiated with the ANY31C macro in
+// row_any.cc. I420ToAR30Matrix selects it when Neon is available and
+// replaces it with the plain kernel when width is a multiple of 8.
+void I422ToAR30Row_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
                            const uint8_t* uv_buf,
                            uint8_t* dst_ptr,
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -6241,6 +6241,14 @@ int I420ToAR30Matrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToAR30Row = I422ToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToAR30Row = I422ToAR30Row_NEON;
+    }
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -420,6 +420,9 @@ ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
 #ifdef HAS_I444TORGB24ROW_NEON
 ANY31C(I444ToRGB24Row_Any_NEON, I444ToRGB24Row_NEON, 0, 0, 3, 7)
 #endif
+#ifdef HAS_I422TOAR30ROW_NEON
+// Generates I422ToAR30Row_Any_NEON on top of I422ToAR30Row_NEON. The
+// arguments (1, 0, 4, 7) are the same as for I422ToARGBRow_Any_NEON; the
+// trailing 7 presumably is the width mask for the 8-wide kernel -- confirm
+// against the ANY31C definition.
+ANY31C(I422ToAR30Row_Any_NEON, I422ToAR30Row_NEON, 1, 0, 4, 7)
+#endif
 #ifdef HAS_I422TOARGBROW_NEON
 ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
 ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -440,6 +440,32 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
      : "cc", "memory", YUVTORGB_REGS, "v19");
 }

+// Neon (AArch64) implementation of the I422 to AR30 row conversion.
+//
+// Reads from src_y / src_u / src_v and writes to dst_ar30, using the
+// kUVCoeff and kRGBCoeffBias tables from yuvconstants. The loop subtracts 8
+// from width on every iteration and repeats while the result is positive;
+// I420ToAR30Matrix only selects this kernel directly when width is a
+// multiple of 8 and uses I422ToAR30Row_Any_NEON otherwise.
+//
+// The load, convert and store steps are the READYUV422, I4XXTORGB and
+// STOREAR30 macros, which are defined outside this function.
+void I422ToAR30Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  // Broadcast to every 16-bit lane of v22 below; presumably the clamp
+  // ceiling consumed by STOREAR30 (TODO confirm against the macro).
+  const uint16_t limit = 0x3ff0;
+  // volatile is required: none of the "+r" outputs are read after the
+  // statement, so a plain asm may be treated as dead and removed by the
+  // optimizer even though it writes through dst_ar30.
+  asm volatile(
+      YUVTORGB_SETUP
+      "dup      v22.8h, %w[limit]                  \n"
+      "movi     v23.8h, #0xc0, lsl #8              \n"  // A
+      "1:                                          \n" READYUV422 I4XXTORGB
+      // The counter update is issued before the store; b.gt below tests
+      // the flags set by this subs.
+      "subs     %w[width], %w[width], #8           \n" STOREAR30
+      "b.gt     1b                                 \n"
+      : [src_y] "+r"(src_y),             // %[src_y]
+        [src_u] "+r"(src_u),             // %[src_u]
+        [src_v] "+r"(src_v),             // %[src_v]
+        [dst_ar30] "+r"(dst_ar30),       // %[dst_ar30]
+        [width] "+r"(width)              // %[width]
+      : [kUVCoeff] "r"(uv_coeff),        // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff),  // %[kRGBCoeffBias]
+        [limit] "r"(limit)               // %[limit]
+      : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
+}
+
 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,