From d1ec694ad38f9c5f1cc07e8db7b4157a44a14ae8 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Wed, 10 Apr 2024 16:36:26 +0100
Subject: [PATCH] [AArch64] Add P{210,410}To{ARGB,AR30}Row_NEON

There are existing x86 implementations for these kernels, but not for
AArch64, so add them.

Reduction in runtimes, compared to the existing C code compiled with
LLVM 17:

              | Cortex-A55 | Cortex-A510 | Cortex-A76
P210ToARGBRow |     -59.8% |      -16.8% |     -53.2%
P210ToAR30Row |     -48.1% |      -21.8% |     -54.0%
P410ToARGBRow |     -56.5% |      -32.2% |     -54.1%
P410ToAR30Row |     -42.4% |       -4.5% |     -50.4%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: libyuv:976
Change-Id: I24a5addd2c54c7fdfb9717e2a45ae5acd43d6e96
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5607764
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
---
 include/libyuv/row.h   |  44 ++++++++++++++
 source/convert_argb.cc |  64 ++++++++++++++++++++
 source/row_any.cc      |  12 ++++
 source/row_neon64.cc   | 129 ++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 247 insertions(+), 2 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index e526eac1f..86b024321 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -564,6 +564,10 @@ extern "C" {
 #define HAS_I212TOARGBROW_NEON
 #define HAS_I212TOAR30ROW_NEON
 #define HAS_I422TOAR30ROW_NEON
+#define HAS_P210TOAR30ROW_NEON
+#define HAS_P210TOARGBROW_NEON
+#define HAS_P410TOAR30ROW_NEON
+#define HAS_P410TOARGBROW_NEON
 
 #define HAS_ABGRTOYJROW_NEON_DOTPROD
 #define HAS_ABGRTOYROW_NEON_DOTPROD
@@ -5325,6 +5329,46 @@ void ABGRToAR30Row_Any_NEON(const uint8_t* src_ptr,
 void ARGBToAR30Row_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int width);
+void P210ToARGBRow_NEON(const uint16_t* y_buf,
+                        const uint16_t* uv_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void P410ToARGBRow_NEON(const uint16_t* y_buf,
+                        const uint16_t* uv_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void P210ToAR30Row_NEON(const uint16_t* y_buf,
+                        const uint16_t* uv_buf,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void P410ToAR30Row_NEON(const uint16_t* y_buf,
+                        const uint16_t* uv_buf,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void P210ToARGBRow_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* uv_buf,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void P410ToARGBRow_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* uv_buf,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void P210ToAR30Row_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* uv_buf,
+                            uint8_t* dst_ar30,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void P410ToAR30Row_Any_NEON(const uint16_t* y_buf,
+                            const uint16_t* uv_buf,
+                            uint8_t* dst_ar30,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
 void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index c70982dc2..11948726a 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -1891,6 +1891,14 @@ int P010ToARGBMatrix(const uint16_t* src_y,
       P210ToARGBRow = P210ToARGBRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_P210TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P210ToARGBRow = P210ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P210ToARGBRow = P210ToARGBRow_NEON;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
@@ -1942,6 +1950,14 @@ int P210ToARGBMatrix(const uint16_t* src_y,
       P210ToARGBRow = P210ToARGBRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_P210TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P210ToARGBRow = P210ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P210ToARGBRow = P210ToARGBRow_NEON;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
@@ -1991,6 +2007,14 @@ int P010ToAR30Matrix(const uint16_t* src_y,
       P210ToAR30Row = P210ToAR30Row_AVX2;
     }
   }
+#endif
+#if defined(HAS_P210TOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P210ToAR30Row = P210ToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P210ToAR30Row = P210ToAR30Row_NEON;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
@@ -2042,6 +2066,14 @@ int P210ToAR30Matrix(const uint16_t* src_y,
       P210ToAR30Row = P210ToAR30Row_AVX2;
     }
   }
+#endif
+#if defined(HAS_P210TOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P210ToAR30Row = P210ToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P210ToAR30Row = P210ToAR30Row_NEON;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
@@ -7931,6 +7963,14 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_P410TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P410ToARGBRow = P410ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToARGBRow = P410ToARGBRow_NEON;
+    }
+  }
+#endif
 
 #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
   if (TestCpuFlag(kCpuHasSSE41)) {
@@ -8024,6 +8064,14 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_P410TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P410ToARGBRow = P410ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToARGBRow = P410ToARGBRow_NEON;
+    }
+  }
+#endif
 
 #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
   if (TestCpuFlag(kCpuHasSSE41)) {
@@ -8103,6 +8151,14 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_P410TOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P410ToAR30Row = P410ToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToAR30Row = P410ToAR30Row_NEON;
+    }
+  }
+#endif
 
 #ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
   if (TestCpuFlag(kCpuHasSSE41)) {
@@ -8196,6 +8252,14 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_P410TOAR30ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    P410ToAR30Row = P410ToAR30Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      P410ToAR30Row = P410ToAR30Row_NEON;
+    }
+  }
+#endif
 
 #ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
   if (TestCpuFlag(kCpuHasSSE41)) {
diff --git a/source/row_any.cc b/source/row_any.cc
index 9a3af5e6b..1fc4d958b 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -897,6 +897,12 @@ ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
 #ifdef HAS_P210TOAR30ROW_AVX2
 ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
 #endif
+#ifdef HAS_P210TOAR30ROW_NEON
+ANY21CT(P210ToAR30Row_Any_NEON, P210ToAR30Row_NEON, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P210TOARGBROW_NEON
+ANY21CT(P210ToARGBRow_Any_NEON, P210ToARGBRow_NEON, 1, 0, uint16_t, 2, 4, 7)
+#endif
 #ifdef HAS_P410TOAR30ROW_SSSE3
 ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
 #endif
@@ -909,6 +915,12 @@ ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
 #ifdef HAS_P410TOAR30ROW_AVX2
 ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
 #endif
+#ifdef HAS_P410TOAR30ROW_NEON
+ANY21CT(P410ToAR30Row_Any_NEON, P410ToAR30Row_NEON, 0, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_P410TOARGBROW_NEON
+ANY21CT(P410ToARGBRow_Any_NEON, P410ToARGBRow_NEON, 0, 0, uint16_t, 2, 4, 7)
+#endif
 
 #undef ANY21CT
 
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index bbe33842c..ba12ac9c0 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -52,6 +52,13 @@ extern "C" {
   "prfm       pldl1keep, [%[src_u], 128]     \n" \
   "prfm       pldl1keep, [%[src_v], 128]     \n"
 
+// Read 8 Y, 4 U and 4 V interleaved from 210
+#define READYUVP210                              \
+  "ldr        q0, [%[src_y]], #16            \n" \
+  "ldr        q1, [%[src_uv]], #16           \n" \
+  "prfm       pldl1keep, [%[src_y], 448]     \n" \
+  "tbl        v1.16b, {v1.16b}, v2.16b       \n"
+
 // Read 8 Y, 4 U and 4 V from 212
 #define READYUV212                               \
   "ldr        q2, [%[src_y]], #16            \n" \
@@ -80,6 +87,13 @@ extern "C" {
   "prfm       pldl1keep, [%[src_u], 128]     \n" \
   "prfm       pldl1keep, [%[src_v], 128]     \n"
 
+// Read 8 Y, 8 U and 8 V interleaved from 410
+#define READYUVP410                                \
+  "ldr        q0, [%[src_y]], #16              \n" \
+  "ldp        q4, q5, [%[src_uv]], #32         \n" \
+  "prfm       pldl1keep, [%[src_y], 448]       \n" \
+  "tbl        v1.16b, {v4.16b, v5.16b}, v2.16b \n"
+
 // Read 8 Y, 8 U and 8 V from 444
 #define READYUV444                               \
   "ldr        d0, [%[src_y]], #8             \n" \
@@ -208,9 +222,9 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9,  9,  13, 13,
    */                                                                     \
   "uqshl    v0.8h, v16.8h, #2                  \n" /* bbbbbbbbbbxxxxxx */ \
   "uqshl    v1.8h, v17.8h, #2                  \n" /* ggggggggggxxxxxx */ \
-  "umin     v2.8h, v18.8h, v22.8h              \n" /* 00rrrrrrrrrrxxxx */ \
+  "umin     v6.8h, v18.8h, v22.8h              \n" /* 00rrrrrrrrrrxxxx */ \
   "shl      v4.8h, v1.8h, #4                   \n" /* ggggggxxxxxx0000 */ \
-  "orr      v5.16b, v2.16b, v23.16b            \n" /* 11rrrrrrrrrrxxxx */ \
+  "orr      v5.16b, v6.16b, v23.16b            \n" /* 11rrrrrrrrrrxxxx */ \
   "sri      v4.8h, v0.8h, #6                   \n" /* ggggggbbbbbbbbbb */ \
   "sri      v5.8h, v1.8h, #12                  \n" /* 11rrrrrrrrrrgggg */ \
   "st2      {v4.8h, v5.8h}, [%[dst_ar30]], #32 \n"
@@ -442,6 +456,117 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
       : "cc", "memory", YUVTORGB_REGS, "v19");
 }
 
+uint8_t kP210LoadShuffleIndices[] = {1, 1, 5, 5, 9,  9,  13, 13,
+                                     3, 3, 7, 7, 11, 11, 15, 15};
+
+void P210ToARGBRow_NEON(const uint16_t* src_y,
+                        const uint16_t* src_uv,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  asm volatile(
+      YUVTORGB_SETUP
+      "movi         v19.8b, #255                       \n"
+      "ldr          q2, [%[kIndices]]                  \n"
+      "1:                                              \n"  //
+      READYUVP210 NVTORGB RGBTORGB8
+      "subs         %w[width], %w[width], #8           \n"
+      "st4          {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
+      "b.gt         1b                                 \n"
+      : [src_y] "+r"(src_y),                     // %[src_y]
+        [src_uv] "+r"(src_uv),                   // %[src_uv]
+        [dst_argb] "+r"(dst_argb),               // %[dst_argb]
+        [width] "+r"(width)                      // %[width]
+      : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
+        [kIndices] "r"(kP210LoadShuffleIndices)  // %[kIndices]
+      : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+uint8_t kP410LoadShuffleIndices[] = {1, 5, 9,  13, 17, 21, 25, 29,
+                                     3, 7, 11, 15, 19, 23, 27, 31};
+
+void P410ToARGBRow_NEON(const uint16_t* src_y,
+                        const uint16_t* src_uv,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  asm volatile(
+      YUVTORGB_SETUP
+      "movi         v19.8b, #255               \n"
+      "ldr          q2, [%[kIndices]]   \n"
+      "1:                                      \n"  //
+      READYUVP410 NVTORGB RGBTORGB8
+      "subs         %w[width], %w[width], #8   \n"
+      "st4          {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
+      "b.gt         1b                         \n"
+      : [src_y] "+r"(src_y),                     // %[src_y]
+        [src_uv] "+r"(src_uv),                   // %[src_uv]
+        [dst_argb] "+r"(dst_argb),               // %[dst_argb]
+        [width] "+r"(width)                      // %[width]
+      : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
+        [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
+        [kIndices] "r"(kP410LoadShuffleIndices)  // %[kIndices]
+      : "cc", "memory", YUVTORGB_REGS, "v19");
+}
+
+void P210ToAR30Row_NEON(const uint16_t* src_y,
+                        const uint16_t* src_uv,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  const uint16_t limit = 0x3ff0;
+  asm volatile(YUVTORGB_SETUP
+               "dup          v22.8h, %w[limit]          \n"
+               "movi         v23.8h, #0xc0, lsl #8      \n"  // A
+               "ldr          q2, [%[kIndices]]          \n"
+               "1:                                      \n" READYUVP210 NVTORGB
+               "subs         %w[width], %w[width], #8   \n" STOREAR30
+               "b.gt         1b                         \n"
+               : [src_y] "+r"(src_y),                     // %[src_y]
+                 [src_uv] "+r"(src_uv),                   // %[src_uv]
+                 [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
+                 [width] "+r"(width)                      // %[width]
+               : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
+                 [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
+                 [limit] "r"(limit),                      // %[limit]
+                 [kIndices] "r"(kP210LoadShuffleIndices)  // %[kIndices]
+               : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
+}
+
+void P410ToAR30Row_NEON(const uint16_t* src_y,
+                        const uint16_t* src_uv,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
+  const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
+  uint16_t limit = 0x3ff0;
+  uint16_t alpha = 0xc000;
+  asm volatile(YUVTORGB_SETUP
+               "dup          v22.8h, %w[limit]          \n"
+               "movi         v23.8h, #0xc0, lsl #8      \n"  // A
+               "ldr          q2, [%[kIndices]]          \n"
+               "1:                                      \n" READYUVP410 NVTORGB
+               "subs         %w[width], %w[width], #8   \n" STOREAR30
+               "b.gt         1b                         \n"
+               : [src_y] "+r"(src_y),                     // %[src_y]
+                 [src_uv] "+r"(src_uv),                   // %[src_uv]
+                 [dst_ar30] "+r"(dst_ar30),               // %[dst_ar30]
+                 [width] "+r"(width)                      // %[width]
+               : [kUVCoeff] "r"(uv_coeff),                // %[kUVCoeff]
+                 [kRGBCoeffBias] "r"(rgb_coeff),          // %[kRGBCoeffBias]
+                 [limit] "r"(limit),                      // %[limit]
+                 [kIndices] "r"(kP410LoadShuffleIndices)  // %[kIndices]
+               : "cc", "memory", YUVTORGB_REGS, "v22", "v23");
+}
+
 void I422ToAR30Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,