[AArch64] Add Neon implementations for {ARGB,ABGR}ToAR30Row

There are existing x86 implementations for these kernels but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

            | ABGRToAR30Row | ARGBToAR30Row
Cortex-A55  |        -55.1% |        -55.1%
Cortex-A510 |        -39.3% |        -40.1%
Cortex-A76  |        -62.3% |        -63.6%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I307f03bddcbe5429c2d3ab2f42aa023a3539ddd0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465592
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-06 16:56:55 +08:00 · 2024-04-10 16:36:25 +01:00 · 2024-04-10 16:36:25 +01:00 · 9fac9a4a82
commit 9fac9a4a82
parent 83c48c782a
4 changed files with 86 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -553,6 +553,9 @@ extern "C" {

 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_ARGBTOAR30ROW_NEON
+#define HAS_ABGRTOAR30ROW_NEON
+
 #define HAS_ABGRTOYJROW_NEON_DOTPROD
 #define HAS_ABGRTOYROW_NEON_DOTPROD
 #define HAS_ARGBTOYJROW_NEON_DOTPROD
@@ -5136,6 +5139,14 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            const struct YuvConstants* yuvconstants,
                            int width);
+void ARGBToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
                           const uint8_t* u_buf,
                           const uint8_t* v_buf,
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -2268,6 +2268,14 @@ int ABGRToAR30(const uint8_t* src_abgr,
    height = 1;
    src_stride_abgr = dst_stride_ar30 = 0;
  }
+#if defined(HAS_ABGRTOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToAR30Row = ABGRToAR30Row_NEON;
+    }
+  }
+#endif
 #if defined(HAS_ABGRTOAR30ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
@@ -2317,6 +2325,14 @@ int ARGBToAR30(const uint8_t* src_argb,
    height = 1;
    src_stride_argb = dst_stride_ar30 = 0;
  }
+#if defined(HAS_ARGBTOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToAR30Row = ARGBToAR30Row_NEON;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOAR30ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -948,6 +948,12 @@ ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
 #if defined(HAS_ABGRTOAR30ROW_SSSE3)
 ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
 #endif
+#if defined(HAS_ABGRTOAR30ROW_NEON)
+ANY11(ABGRToAR30Row_Any_NEON, ABGRToAR30Row_NEON, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_NEON)
+ANY11(ARGBToAR30Row_Any_NEON, ARGBToAR30Row_NEON, 0, 4, 4, 7)
+#endif
 #if defined(HAS_ARGBTOAR30ROW_SSSE3)
 ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
 #endif
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1722,6 +1722,59 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
  );
 }

+static const int16_t kAR30Row_BoxShifts[] = {0, -6, 0, -6, 0, -6, 0, -6};  // USHL counts: odd 16-bit lanes >> 6
+
+static const uint8_t kABGRToAR30Row_BoxIndices[] = {  // TBL: each byte twice
+    2, 2, 1, 1, 6, 6, 5, 5, 10, 10, 9,  9,  14, 14, 13, 13,   // bytes 2, 1 of each 4-byte group
+    0, 0, 3, 3, 4, 4, 7, 7, 8,  8,  11, 11, 12, 12, 15, 15};  // bytes 0, 3 of each 4-byte group
+static const uint8_t kARGBToAR30Row_BoxIndices[] = {  // TBL: each byte twice
+    0, 0, 1, 1, 4, 4, 5, 5, 8,  8,  9,  9,  12, 12, 13, 13,   // bytes 0, 1 of each 4-byte group
+    2, 2, 3, 3, 6, 6, 7, 7, 10, 10, 11, 11, 14, 14, 15, 15};  // bytes 2, 3 of each 4-byte group
+
+// ARGB or ABGR to AR30, 32 bytes (width -= 8) per loop; TBL indices set order.
+static void ABCDToAR30Row_NEON(const uint8_t* src_abcd,
+                               uint8_t* dst_ar30,
+                               int width,
+                               const uint8_t* indices) {
+  asm volatile(
+      "movi      v2.4s, #0xf, msl 16             \n"  // 0xfffff (low 20 bits)
+      "ldr       q3, [%[kAR30Row_BoxShifts]]     \n"  // shifts {0, -6} x4
+      "ldp       q4, q5, [%[indices]]            \n"  // v4 = lo, v5 = hi idx
+      "1:                                        \n"
+      "ldp       q0, q20, [%[src]], #32          \n"  // load 32 bytes
+      "subs      %w[width], %w[width], #8        \n"  // 8 done per loop
+      "tbl       v1.16b, {v0.16b}, v5.16b        \n"  // bytes for bits 20-31
+      "tbl       v21.16b, {v20.16b}, v5.16b      \n"  // bytes for bits 20-31
+      "tbl       v0.16b, {v0.16b}, v4.16b        \n"  // bytes for bits 0-19
+      "tbl       v20.16b, {v20.16b}, v4.16b      \n"  // bytes for bits 0-19
+      "ushl      v0.8h, v0.8h, v3.8h             \n"  // odd .8h lanes >> 6
+      "ushl      v20.8h, v20.8h, v3.8h           \n"  // odd .8h lanes >> 6
+      "ushl      v1.8h, v1.8h, v3.8h             \n"  // odd .8h lanes >> 6
+      "ushl      v21.8h, v21.8h, v3.8h           \n"  // odd .8h lanes >> 6
+      "ushr      v0.4s, v0.4s, #6                \n"  // pack to bits 0-19
+      "ushr      v20.4s, v20.4s, #6              \n"  // pack to bits 0-19
+      "shl       v1.4s, v1.4s, #14               \n"  // align to bits 14-31
+      "shl       v21.4s, v21.4s, #14             \n"  // align to bits 14-31
+      "bif       v0.16b, v1.16b, v2.16b          \n"  // bits 20-31 from v1
+      "bif       v20.16b, v21.16b, v2.16b        \n"  // bits 20-31 from v21
+      "stp       q0, q20, [%[dst]], #32          \n"  // store 32 bytes
+      "b.gt      1b                              \n"  // while width > 0
+      : [src] "+r"(src_abcd),                          // %[src]
+        [dst] "+r"(dst_ar30),                          // %[dst]
+        [width] "+r"(width)                            // %[width]
+      : [kAR30Row_BoxShifts] "r"(kAR30Row_BoxShifts),  // %[kAR30Row_BoxShifts]
+        [indices] "r"(indices)                         // %[indices]
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v20", "v21");
+}
+
+void ABGRToAR30Row_NEON(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
+  ABCDToAR30Row_NEON(src_abgr, dst_ar30, width, kABGRToAR30Row_BoxIndices);  // shared kernel, ABGR index table
+}
+
+void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
+  ABCDToAR30Row_NEON(src_argb, dst_ar30, width, kARGBToAR30Row_BoxIndices);  // shared kernel, ARGB index table
+}
+
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {