[AArch64] Add SVE2 implementation of DivideRow_16

SVE contains the UMULH instruction which allows us to multiply and take
the high half of the result in a single instruction rather than needing
separate widening multiply and then narrowing shift steps.

Observed reduction in runtime compared to the existing Neon code:

Cortex-A510: -21.2%
Cortex-A520: -20.9%
Cortex-A715: -47.9%
Cortex-A720: -47.6%
  Cortex-X2:  -5.2%
  Cortex-X3:  -2.6%
  Cortex-X4: -32.4%
Cortex-X925:  -1.5%

Bug: b/42280942
Change-Id: I25154699b17772db1fb5cb84c049919181d86f4b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5975318
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-05-05 20:51:51 +01:00 committed by Frank Barchard
parent aec4b4e22e
commit f27b983f38
3 changed files with 60 additions and 0 deletions

View File

@@ -548,6 +548,7 @@ extern "C" {
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
#define HAS_BGRATOUVROW_SVE2
#define HAS_DIVIDEROW_16_SVE2
#define HAS_I400TOARGBROW_SVE2
#define HAS_I422ALPHATOARGBROW_SVE2
#define HAS_I422TOARGB1555ROW_SVE2
@@ -3302,6 +3303,10 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width);
// SVE2 implementation of DivideRow_16 (per-element fixed-point scaling of a
// row of 16-bit pixels); selected at runtime when kCpuHasSVE2 is set.
void DivideRow_16_SVE2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_Any_NEON(const uint16_t* src_ptr,
                           uint16_t* dst_ptr,
                           int scale,

View File

@@ -877,6 +877,11 @@ void ConvertToLSBPlane_16(const uint16_t* src_y,
}
}
#endif
#if defined(HAS_DIVIDEROW_16_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
DivideRow = DivideRow_16_SVE2;
}
#endif
for (y = 0; y < height; ++y) {
DivideRow(src_y, dst_y, scale, width);

View File

@@ -1620,6 +1620,56 @@ void ARGBToRAWRow_SVE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
ARGBToXYZRow_SVE2(src_argb, dst_rgb, width, kARGBToRAWRowIndices);
}
// Scale a row of 16-bit pixels: dst[i] = (src[i] * scale) >> 16 for each of
// `width` elements, via the SVE2 UMULH instruction (high half of the 16x16
// unsigned product), avoiding Neon's separate widening-multiply + narrowing
// shift. NOTE(review): `scale` is presumably a fixed-point reciprocal
// precomputed by the caller so this effects a division — confirm against the
// DivideRow_16 callers/NEON variant.
void DivideRow_16_SVE2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// Number of 16-bit lanes in one SVE vector; written once by CNTH below.
uint64_t vl;
asm volatile(
// vl = count of halfword (16-bit) elements per vector.
"cnth %x[vl] \n"
// Broadcast the scale factor into every 16-bit lane of z0.
"dup z0.h, %w[scale] \n"
// Each main-loop iteration consumes two vectors (2*vl elements).
// SUBS sets flags for the b.le below.
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
"b.le 2f \n"
// Run bulk of computation with the same predicates to avoid predicate
// generation overhead.
"ptrue p0.h \n"
"1: \n"
"ld1h {z1.h}, p0/z, [%[src]] \n"
"ld1h {z2.h}, p0/z, [%[src], #1, mul vl] \n"
// Advance src by two vectors' worth of bytes.
"incb %[src], all, mul #2 \n"
// High 16 bits of the unsigned product, i.e. (x * scale) >> 16.
"umulh z1.h, z1.h, z0.h \n"
"umulh z2.h, z2.h, z0.h \n"
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
"st1h {z1.h}, p0, [%[dst]] \n"
"st1h {z2.h}, p0, [%[dst], #1, mul vl] \n"
"incb %[dst], all, mul #2 \n"
"b.gt 1b \n"
"2: \n"
// Undo the last subtraction to recover the remaining element count
// (width was driven to <= 0 above); ADDS sets flags for b.eq.
"adds %w[width], %w[width], %w[vl], lsl #1 \n"
"b.eq 99f \n"
// Calculate a pair of predicates for the final iteration to deal with
// the tail.
"whilelt p0.h, wzr, %w[width] \n"
"whilelt p1.h, %w[vl], %w[width] \n"
// Predicated loads zero inactive lanes; predicated stores leave memory
// past the row untouched.
"ld1h {z1.h}, p0/z, [%[src]] \n"
"ld1h {z2.h}, p1/z, [%[src], #1, mul vl] \n"
"umulh z1.h, z1.h, z0.h \n"
"umulh z2.h, z2.h, z0.h \n"
"st1h {z1.h}, p0, [%[dst]] \n"
"st1h {z2.h}, p1, [%[dst], #1, mul vl] \n"
"99: \n"
: [src] "+r"(src_y), // %[src]
[dst] "+r"(dst_y), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [scale] "r"(scale) // %[scale]
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
}
#endif  // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
#ifdef __cplusplus