Add Neon implementation of Convert8To16Row

Add a Neon implementation of the Convert8To16Row kernel. Compared to the
C implementation, we can take advantage of knowing that the "scale"
parameter is always an unsigned power of two and fits in 16 bits,
allowing us to fold the multiplication by "scale" into the final shift
and avoid needing to widen the input data.

Reduction in run times observed compared to the existing C
implementation:

 Cortex-A55: -44.5%
Cortex-A510: -26.1%
Cortex-A520: -30.6%
 Cortex-A76: -61.6%
Cortex-A710: -57.6%
  Cortex-X1: -46.5%
  Cortex-X2: -54.4%
  Cortex-X3: -57.1%
  Cortex-X4: -55.0%
Cortex-X925: -49.3%

Change-Id: I34b858605ece47e46588c0680a1d2afa7a90d7a0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6516186
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2025-03-18 13:42:11 +00:00 committed by Frank Barchard
parent 7e5863ae5a
commit ef9833fc70
4 changed files with 54 additions and 0 deletions

View File

@ -527,6 +527,7 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_ARGBTOAR30ROW_NEON #define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON #define HAS_ABGRTOAR30ROW_NEON
#define HAS_CONVERT8TO16ROW_NEON
#define HAS_I210ALPHATOARGBROW_NEON #define HAS_I210ALPHATOARGBROW_NEON
#define HAS_I410ALPHATOARGBROW_NEON #define HAS_I410ALPHATOARGBROW_NEON
#define HAS_I210TOARGBROW_NEON #define HAS_I210TOARGBROW_NEON
@ -3786,6 +3787,14 @@ void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int scale, int scale,
int width); int width);
// AArch64 NEON row kernel: converts `width` 8-bit samples from src_y into
// 16-bit samples in dst_y, computing (src * 0x0101 * scale) >> 16.
// The implementation consumes 16 samples per loop iteration and only exits
// when the remaining width is exactly zero, so width must be a positive
// multiple of 16; scale is expected to be a power of two.
void Convert8To16Row_NEON(const uint8_t* src_y,
uint16_t* dst_y,
int scale,
int width);
// Wrapper around Convert8To16Row_NEON instantiated via the ANY11C macro.
// Convert8To16Plane selects this variant when width is not a multiple of 16.
// NOTE(review): presumably the wrapper handles the trailing (width % 16)
// samples separately - confirm against the ANY11C definition.
void Convert8To16Row_Any_NEON(const uint8_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void Convert16To8Row_C(const uint16_t* src_y, void Convert16To8Row_C(const uint16_t* src_y,
uint8_t* dst_y, uint8_t* dst_y,

View File

@ -225,6 +225,14 @@ void Convert8To16Plane(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_CONVERT8TO16ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Convert8To16Row = Convert8To16Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
Convert8To16Row = Convert8To16Row_NEON;
}
}
#endif
// Convert plane // Convert plane
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {

View File

@ -1757,6 +1757,15 @@ ANY11C(Convert8To16Row_Any_AVX2,
uint16_t, uint16_t,
31) 31)
#endif #endif
#ifdef HAS_CONVERT8TO16ROW_NEON
// Instantiate Convert8To16Row_Any_NEON as the any-width wrapper around
// Convert8To16Row_NEON.
// NOTE(review): the ANY11C definition is not visible here; the arguments
// appear to be source/destination bytes per sample (1 and 2), the source and
// destination element types, and a width mask of 15 (the NEON kernel handles
// 16 samples per iteration) - confirm against the macro definition.
ANY11C(Convert8To16Row_Any_NEON,
Convert8To16Row_NEON,
1,
2,
uint8_t,
uint16_t,
15)
#endif
#ifdef HAS_MULTIPLYROW_16_AVX2 #ifdef HAS_MULTIPLYROW_16_AVX2
ANY11C(MultiplyRow_16_Any_AVX2, ANY11C(MultiplyRow_16_Any_AVX2,
MultiplyRow_16_AVX2, MultiplyRow_16_AVX2,

View File

@ -5582,6 +5582,34 @@ void Convert8To8Row_NEON(const uint8_t* src_y,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
} }
// Convert a row of 8-bit samples to 16-bit samples.
// Use scale to convert lsb formats to msb, depending on how many bits there
// are:
// 1024 = 10 bits
//
// Preconditions (not checked here):
// - width is a positive multiple of 16: each iteration consumes 16 samples
//   and the loop only exits when the remaining width is exactly zero.
// - scale is a non-zero power of two (__builtin_clz(0) is undefined).
void Convert8To16Row_NEON(const uint8_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
// (src * 0x0101 * scale) >> 16.
// Since scale is a power of two, compute the shift to use to avoid needing
// to widen to int32.
// shift = log2(scale) - 16 = (31 - clz(scale)) - 16. It is negative for
// scale < 65536 (e.g. -6 for 1024); USHL treats a negative per-lane count
// as a right shift.
int shift = 15 - __builtin_clz(scale);
asm volatile(
// Broadcast the shift amount to every 16-bit lane of v2.
"dup v2.8h, %w[shift] \n"
"1: \n"
// Load 16 source bytes and advance src by 16.
"ldr q0, [%[src]], #16 \n"
// Interleave each byte with itself so every 16-bit lane holds
// src * 0x0101: zip2 covers the upper 8 bytes, zip1 the lower 8. zip2 must
// run first because zip1 overwrites v0.
"zip2 v1.16b, v0.16b, v0.16b \n"
"zip1 v0.16b, v0.16b, v0.16b \n"
// Decrement the remaining width; the flags set here drive the b.ne below.
"subs %w[width], %w[width], #16 \n"
// Apply the combined scale-and-shift to both halves.
"ushl v1.8h, v1.8h, v2.8h \n"
"ushl v0.8h, v0.8h, v2.8h \n"
// Store 16 output samples (32 bytes) and advance dst by 32.
"stp q0, q1, [%[dst]], #32 \n"
"b.ne 1b \n"
: [src] "+r"(src_y), // %[src]
[dst] "+r"(dst_y), // %[dst]
[width] "+r"(width) // %[width]
: [shift] "r"(shift) // %[shift]
: "cc", "memory", "v0", "v1", "v2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus #ifdef __cplusplus