Add Neon implementation of Convert8To16Row

Add a Neon implementation of the Convert8To16Row kernel. Compared to the C implementation we can take advantage of knowing that the "scale" parameter is always an unsigned power of two and fits in 16 bits, allowing us to combine this with the shift and avoid needing to widen the input data. Reduction in run times observed compared to the existing C implementation: Cortex-A55: -44.5%, Cortex-A510: -26.1%, Cortex-A520: -30.6%, Cortex-A76: -61.6%, Cortex-A710: -57.6%, Cortex-X1: -46.5%, Cortex-X2: -54.4%, Cortex-X3: -57.1%, Cortex-X4: -55.0%, Cortex-X925: -49.3%. Change-Id: I34b858605ece47e46588c0680a1d2afa7a90d7a0 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6516186 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-07 01:06:46 +08:00 · 2025-03-18 13:42:11 +00:00 · 2025-03-18 13:42:11 +00:00 · ef9833fc70
commit ef9833fc70
parent 7e5863ae5a
4 changed files with 54 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -527,6 +527,7 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #define HAS_ARGBTOAR30ROW_NEON
 #define HAS_ABGRTOAR30ROW_NEON
 #define HAS_CONVERT8TO16ROW_NEON
 #define HAS_I210ALPHATOARGBROW_NEON
 #define HAS_I410ALPHATOARGBROW_NEON
 #define HAS_I210TOARGBROW_NEON
@ -3786,6 +3787,14 @@ void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
                              uint16_t* dst_ptr,
                              int scale,
                              int width);
 // AArch64 NEON row kernel converting 8-bit samples to 16-bit samples.
 // It consumes 16 source samples per loop iteration, and Convert8To16Plane
 // installs it directly only when width is a multiple of 16.
 void Convert8To16Row_NEON(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width);
 // Wrapper around Convert8To16Row_NEON generated by ANY11C in row_any.cc.
 // Convert8To16Plane installs it by default when NEON is available, i.e. for
 // widths that are not a multiple of 16.
 void Convert8To16Row_Any_NEON(const uint8_t* src_y,
                              uint16_t* dst_y,
                              int scale,
                              int width);
 void Convert16To8Row_C(const uint16_t* src_y,
                       uint8_t* dst_y,
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -225,6 +225,14 @@ void Convert8To16Plane(const uint8_t* src_y,
    }
  }
 #endif
 #if defined(HAS_CONVERT8TO16ROW_NEON)
  // With NEON available, default to the Any wrapper and switch to the raw
  // kernel only when width is a multiple of 16, the number of samples the
  // kernel consumes per loop iteration.
  if (TestCpuFlag(kCpuHasNEON)) {
    Convert8To16Row = Convert8To16Row_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      Convert8To16Row = Convert8To16Row_NEON;
    }
  }
 #endif
  // Convert plane
  for (y = 0; y < height; ++y) {
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -1757,6 +1757,15 @@ ANY11C(Convert8To16Row_Any_AVX2,
       uint16_t,
       31)
 #endif
 #ifdef HAS_CONVERT8TO16ROW_NEON
 // Defines Convert8To16Row_Any_NEON on top of Convert8To16Row_NEON, reading
 // uint8_t and writing uint16_t. The trailing 15 corresponds to the 16-sample
 // step of the NEON kernel.
 // NOTE(review): 1 and 2 are presumably the source/destination bytes per
 // sample and 15 the width mask - confirm against the ANY11C definition.
 ANY11C(Convert8To16Row_Any_NEON,
       Convert8To16Row_NEON,
       1,
       2,
       uint8_t,
       uint16_t,
       15)
 #endif
 #ifdef HAS_MULTIPLYROW_16_AVX2
 ANY11C(MultiplyRow_16_Any_AVX2,
       MultiplyRow_16_AVX2,
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -5582,6 +5582,34 @@ void Convert8To8Row_NEON(const uint8_t* src_y,
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
 }
 // Convert a row of 8-bit samples to 16-bit samples using AArch64 NEON.
 // Each source byte is replicated into both halves of a 16-bit lane
 // (src * 0x0101) and the lane is then shifted so the result equals
 // (src * 0x0101 * scale) >> 16. For example, scale = 1024 gives 10-bit
 // output.
 //
 // Preconditions, as the code is written:
 //  - scale is a nonzero power of two (see the shift computation below).
 //  - width is a positive multiple of 16: the loop body runs before the
 //    counter is tested and only exits when the counter is exactly zero.
 // Each iteration reads 16 bytes from src_y and writes 32 bytes to dst_y.
 void Convert8To16Row_NEON(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // (src * 0x0101 * scale) >> 16.
  // Since scale is a power of two, compute the shift to use to avoid needing
  // to widen to int32.
  // With a 32-bit int and scale = 2^k this yields k - 16, e.g. -6 for
  // scale = 1024. USHL treats a negative per-lane shift count as a right
  // shift. __builtin_clz(0) is undefined, so scale must not be zero.
  int shift = 15 - __builtin_clz(scale);
  // v2 holds the shift count broadcast to all eight 16-bit lanes.
  // Zipping v0 with itself duplicates every byte into an adjacent pair, which
  // forms src * 0x0101 in each 16-bit lane without a widening multiply.
  // zip2 (upper eight source bytes -> v1) is issued before zip1 (lower eight
  // source bytes -> v0) because zip1 overwrites the loaded data in v0.
  // The stp then stores v0 followed by v1, preserving sample order.
  asm volatile(
      "dup    v2.8h, %w[shift]                 \n"
      "1:                                      \n"
      "ldr    q0, [%[src]], #16                \n"
      "zip2   v1.16b, v0.16b, v0.16b           \n"
      "zip1   v0.16b, v0.16b, v0.16b           \n"
      "subs   %w[width], %w[width], #16        \n"
      "ushl   v1.8h, v1.8h, v2.8h              \n"
      "ushl   v0.8h, v0.8h, v2.8h              \n"
      "stp    q0, q1, [%[dst]], #32            \n"
      "b.ne   1b                               \n"
      : [src] "+r"(src_y),   // %[src]
        [dst] "+r"(dst_y),   // %[dst]
        [width] "+r"(width)  // %[width]
      : [shift] "r"(shift)   // %[shift]
      : "cc", "memory", "v0", "v1", "v2");
 }
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #ifdef __cplusplus