diff --git a/include/libyuv/row.h b/include/libyuv/row.h index aa2c69372..5645a73f3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -527,6 +527,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #define HAS_ARGBTOAR30ROW_NEON #define HAS_ABGRTOAR30ROW_NEON +#define HAS_CONVERT8TO16ROW_NEON #define HAS_I210ALPHATOARGBROW_NEON #define HAS_I410ALPHATOARGBROW_NEON #define HAS_I210TOARGBROW_NEON @@ -3786,6 +3787,14 @@ void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int scale, int width); +void Convert8To16Row_NEON(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_Any_NEON(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index c2d4b67a4..7c2785cf2 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -225,6 +225,14 @@ void Convert8To16Plane(const uint8_t* src_y, } } #endif +#if defined(HAS_CONVERT8TO16ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Convert8To16Row = Convert8To16Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_NEON; + } + } +#endif // Convert plane for (y = 0; y < height; ++y) { diff --git a/source/row_any.cc b/source/row_any.cc index 5dac7a9c7..4c19d792a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1757,6 +1757,15 @@ ANY11C(Convert8To16Row_Any_AVX2, uint16_t, 31) #endif +#ifdef HAS_CONVERT8TO16ROW_NEON +ANY11C(Convert8To16Row_Any_NEON, + Convert8To16Row_NEON, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif #ifdef HAS_MULTIPLYROW_16_AVX2 ANY11C(MultiplyRow_16_Any_AVX2, MultiplyRow_16_AVX2, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index f14061dec..0fe54c830 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -5582,6 +5582,34 @@ void Convert8To8Row_NEON(const uint8_t* src_y, : "cc", "memory", "v0", "v1", "v2", 
"v3", "v4", "v5");
 }
 
+// Widen 8-bit samples to 16 bits: dst = (src * 0x0101 * scale) >> 16, e.g.
+// scale 1024 = 10 bits. The loop requires width to be a multiple of 16.
+void Convert8To16Row_NEON(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width) {
+  // scale is taken to be a power of two, so the multiply and >> 16 collapse to
+  // one per-lane shift of src * 0x0101 and no widening to int32 is needed.
+  // NOTE(review): __builtin_clz(0) is undefined; confirm scale > 0 always.
+  int shift = 15 - __builtin_clz(scale);
+  asm volatile(
+      "dup v2.8h, %w[shift] \n"  // broadcast the shift count to all 8 lanes
+      "1: \n"
+      "ldr q0, [%[src]], #16 \n"  // load 16 source bytes, src += 16
+      "zip2 v1.16b, v0.16b, v0.16b \n"  // bytes 8..15 doubled: x * 0x0101
+      "zip1 v0.16b, v0.16b, v0.16b \n"  // bytes 0..7 doubled: x * 0x0101
+      "subs %w[width], %w[width], #16 \n"  // 16 elements consumed per pass
+      "ushl v1.8h, v1.8h, v2.8h \n"  // a negative count shifts right
+      "ushl v0.8h, v0.8h, v2.8h \n"
+      "stp q0, q1, [%[dst]], #32 \n"  // store 16 uint16_t, dst += 32 bytes
+      "b.ne 1b \n"  // leaves the loop only when width reaches exactly 0
+      : [src] "+r"(src_y), // %[src]
+        [dst] "+r"(dst_y), // %[dst]
+        [width] "+r"(width) // %[width]
+      : [shift] "r"(shift) // %[shift]
+      : "cc", "memory", "v0", "v1", "v2");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus