[AArch64] Add I8MM implementation of ARGBToUV444Row

We cannot use the standard dot-product instructions since the coefficient products are both added and subtracted, but I8MM supports mixed-sign dot products, which work well here. We need to add an additional variant of the coefficient structs since we need negative constants for the elements that were previously subtracted. Reduction in runtimes observed compared to the previous Neon implementation: Cortex-A510: -37.3% Cortex-A520: -31.1% Cortex-A715: -37.1% Cortex-A720: -37.0% Cortex-X2: -62.1% Cortex-X3: -62.2% Cortex-X4: -40.4% Bug: libyuv:977 Change-Id: Idc3d9a6408c30e1bce3816a1ed926ecd76792236 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5712928 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
2025-12-06 16:56:55 +08:00 · 2024-04-18 16:18:10 +01:00 · 2024-04-18 16:18:10 +01:00 · a4ccf9940e
commit a4ccf9940e
parent 302d29d1a8
4 changed files with 79 additions and 6 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -578,6 +578,7 @@ extern "C" {
 #define HAS_RGBATOYROW_NEON_DOTPROD

 #define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
+#define HAS_ARGBTOUV444ROW_NEON_I8MM
 #endif

 // The following are available on AArch64 SVE platforms:
@ -1611,6 +1612,10 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width);
+void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
@ -2118,6 +2123,10 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width);
+void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
+                                  uint8_t* dst_u,
+                                  uint8_t* dst_v,
+                                  int width);
 void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_u,
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb,
    }
  }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM)
+  if (TestCpuFlag(kCpuHasNeonI8MM)) {  // only when the I8MM CPU flag is set
+    ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM;  // default: Any wrapper
+    if (IS_ALIGNED(width, 8)) {  // kernel consumes 8 pixels per loop
+      ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM;  // call the kernel directly
+    }
+  }
+#endif  // HAS_ARGBTOUV444ROW_NEON_I8MM
 #if defined(HAS_ARGBTOUV444ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -2116,6 +2116,9 @@ ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
 #endif
+#ifdef HAS_ARGBTOUV444ROW_NEON_I8MM  // same ANY12 arguments as the NEON entry
+ANY12(ARGBToUV444Row_Any_NEON_I8MM, ARGBToUV444Row_NEON_I8MM, 0, 4, 0, 7)
+#endif  // HAS_ARGBTOUV444ROW_NEON_I8MM
 #ifdef HAS_YUY2TOUV422ROW_MSA
 ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
 ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2711,17 +2711,22 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
  );
 }

-struct RgbUVConstants {
+struct RgbUVConstantsU8 {  // unsigned coefficients (NEON kernel)
  uint8_t kRGBToU[4];
  uint8_t kRGBToV[4];
 };

+struct RgbUVConstantsI8 {  // signed coefficients (I8MM kernel)
+  int8_t kRGBToU[4];  // U weights; read as one 32-bit element by ld2r
+  int8_t kRGBToV[4];  // V weights; must directly follow kRGBToU for ld2r
+};
+
 // 8x1 pixels.
 void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width,
-                               const struct RgbUVConstants* rgbuvconstants) {
+                               const struct RgbUVConstantsU8* rgbuvconstants) {
  asm volatile(
      "ldr         d0, [%4]                      \n"  // load rgbuvconstants
      "dup         v24.16b, v0.b[0]              \n"  // UB  0.875 coefficient
@ -2758,6 +2763,42 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
        "v27", "v28", "v29");
 }

+// 8x1 pixels. I8MM variant of ARGBToUV444MatrixRow_NEON.
+// Each USDOT multiplies the four unsigned bytes of a pixel by the four signed
+// coefficients in kRGBToU / kRGBToV and sums them, so the terms that must be
+// subtracted are expressed as negative coefficients. The low 16 bits of each
+// sum are biased by 0x8080 and the high byte is stored as the U or V sample.
+// The loop body runs before width is tested, so at least 8 pixels are always
+// read and written; width is decremented by 8 per iteration.
+void ARGBToUV444MatrixRow_NEON_I8MM(
+    const uint8_t* src_argb,
+    uint8_t* dst_u,
+    uint8_t* dst_v,
+    int width,
+    const struct RgbUVConstantsI8* rgbuvconstants) {
+  // volatile is required: none of the output operands is used after the asm,
+  // so a non-volatile statement may be discarded by the optimizer.
+  asm volatile(
+      "ld2r        {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"  // v16=U, v17=V
+      "movi        v29.16b, #0x80                \n"  // 128.5
+      "1:                                        \n"
+      "ldp         q0, q1, [%[src]], #32         \n"  // load 8 pixels.
+      "movi        v2.4s, #0                     \n"  // clear accumulators.
+      "movi        v3.4s, #0                     \n"
+      "movi        v4.4s, #0                     \n"
+      "movi        v5.4s, #0                     \n"
+      "usdot       v2.4s, v0.16b, v16.16b        \n"  // U, pixels 0-3
+      "usdot       v3.4s, v1.16b, v16.16b        \n"  // U, pixels 4-7
+      "usdot       v4.4s, v0.16b, v17.16b        \n"  // V, pixels 0-3
+      "usdot       v5.4s, v1.16b, v17.16b        \n"  // V, pixels 4-7
+      "prfm        pldl1keep, [%[src], 448]      \n"
+      "subs        %w[width], %w[width], #8      \n"  // 8 processed per loop.
+      "uzp1        v0.8h, v2.8h, v3.8h           \n"  // low 16 bits of U sums
+      "uzp1        v1.8h, v4.8h, v5.8h           \n"  // low 16 bits of V sums
+      "addhn       v0.8b, v0.8h, v29.8h          \n"  // +128 -> unsigned
+      "addhn       v1.8b, v1.8h, v29.8h          \n"  // +128 -> unsigned
+      "str         d0, [%[dst_u]], #8            \n"  // store 8 pixels U.
+      "str         d1, [%[dst_v]], #8            \n"  // store 8 pixels V.
+      "b.gt        1b                            \n"
+      : [src] "+r"(src_argb),                 // %[src]
+        [dst_u] "+r"(dst_u),                  // %[dst_u]
+        [dst_v] "+r"(dst_v),                  // %[dst_v]
+        [width] "+r"(width)                   // %[width]
+      : [rgbuvconstants] "r"(rgbuvconstants)  // %[rgbuvconstants]
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
+        "v29");
+}
+
 // RGB to bt601 coefficients
 // UB   0.875 coefficient = 112
 // UG -0.5781 coefficient = 74
@ -2766,15 +2807,27 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
 // VG -0.7344 coefficient = 94
 // VR   0.875 coefficient = 112 (ignored)

-static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
+static const struct RgbUVConstantsU8 kRgb24I601UVConstantsU8 = {
+    {112, 74, 38, 0},  // U magnitudes: UB, UG, UR, unused
    {18, 94, 112, 0}};
+static const struct RgbUVConstantsI8 kRgb24I601UVConstantsI8 = {
+    {112, -74, -38, 0},   // U: +UB, -UG, -UR, unused
+    {-18, -94, 112, 0}};  // V: -VB, -VG, +VR, unused

 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
-                            &kRgb24I601UVConstants);
+                            &kRgb24I601UVConstantsU8);  // unsigned table
+}
+
+void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width) {  // BT.601 U/V via the I8MM kernel
+  ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
+                                 &kRgb24I601UVConstantsI8);  // signed table
 }

 #define RGBTOUV_SETUP_REG                                                  \