mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Add Neon implementation of Convert8To16Row
Add a Neon implementation of the Convert8To16Row kernel. Compared to the C implementation we can take advantage of knowing that the "scale" parameter is always an unsigned power of two and fits in 16 bits, allowing us to combine this with the shift and avoid needing to widen the input data. Reduction in run times observed compared to the existing C implementation: Cortex-A55: -44.5%, Cortex-A510: -26.1%, Cortex-A520: -30.6%, Cortex-A76: -61.6%, Cortex-A710: -57.6%, Cortex-X1: -46.5%, Cortex-X2: -54.4%, Cortex-X3: -57.1%, Cortex-X4: -55.0%, Cortex-X925: -49.3%. Change-Id: I34b858605ece47e46588c0680a1d2afa7a90d7a0. Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6516186. Reviewed-by: Justin Green <greenjustin@google.com>. Reviewed-by: Frank Barchard <fbarchard@chromium.org>.
This commit is contained in:
parent
7e5863ae5a
commit
ef9833fc70
@ -527,6 +527,7 @@ extern "C" {
|
|||||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||||
#define HAS_ARGBTOAR30ROW_NEON
|
#define HAS_ARGBTOAR30ROW_NEON
|
||||||
#define HAS_ABGRTOAR30ROW_NEON
|
#define HAS_ABGRTOAR30ROW_NEON
|
||||||
|
#define HAS_CONVERT8TO16ROW_NEON
|
||||||
#define HAS_I210ALPHATOARGBROW_NEON
|
#define HAS_I210ALPHATOARGBROW_NEON
|
||||||
#define HAS_I410ALPHATOARGBROW_NEON
|
#define HAS_I410ALPHATOARGBROW_NEON
|
||||||
#define HAS_I210TOARGBROW_NEON
|
#define HAS_I210TOARGBROW_NEON
|
||||||
@ -3786,6 +3787,14 @@ void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
|
|||||||
uint16_t* dst_ptr,
|
uint16_t* dst_ptr,
|
||||||
int scale,
|
int scale,
|
||||||
int width);
|
int width);
|
||||||
|
void Convert8To16Row_NEON(const uint8_t* src_y,
|
||||||
|
uint16_t* dst_y,
|
||||||
|
int scale,
|
||||||
|
int width);
|
||||||
|
void Convert8To16Row_Any_NEON(const uint8_t* src_y,
|
||||||
|
uint16_t* dst_y,
|
||||||
|
int scale,
|
||||||
|
int width);
|
||||||
|
|
||||||
void Convert16To8Row_C(const uint16_t* src_y,
|
void Convert16To8Row_C(const uint16_t* src_y,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
|
|||||||
@ -225,6 +225,14 @@ void Convert8To16Plane(const uint8_t* src_y,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_CONVERT8TO16ROW_NEON)
|
||||||
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
|
Convert8To16Row = Convert8To16Row_Any_NEON;
|
||||||
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
Convert8To16Row = Convert8To16Row_NEON;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Convert plane
|
// Convert plane
|
||||||
for (y = 0; y < height; ++y) {
|
for (y = 0; y < height; ++y) {
|
||||||
|
|||||||
@ -1757,6 +1757,15 @@ ANY11C(Convert8To16Row_Any_AVX2,
|
|||||||
uint16_t,
|
uint16_t,
|
||||||
31)
|
31)
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAS_CONVERT8TO16ROW_NEON
|
||||||
|
ANY11C(Convert8To16Row_Any_NEON,
|
||||||
|
Convert8To16Row_NEON,
|
||||||
|
1,
|
||||||
|
2,
|
||||||
|
uint8_t,
|
||||||
|
uint16_t,
|
||||||
|
15)
|
||||||
|
#endif
|
||||||
#ifdef HAS_MULTIPLYROW_16_AVX2
|
#ifdef HAS_MULTIPLYROW_16_AVX2
|
||||||
ANY11C(MultiplyRow_16_Any_AVX2,
|
ANY11C(MultiplyRow_16_Any_AVX2,
|
||||||
MultiplyRow_16_AVX2,
|
MultiplyRow_16_AVX2,
|
||||||
|
|||||||
@ -5582,6 +5582,34 @@ void Convert8To8Row_NEON(const uint8_t* src_y,
|
|||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Widen a row of 8-bit samples to 16-bit samples, computing
// (src * 0x0101 * scale) >> 16 for each sample. The result is left in the low
// bits of each 16-bit output; scale selects the output bit depth:
|
||||||
|
// 1024 = 10 bits
// src_y: input row of 8-bit samples; 16 are consumed per loop iteration.
// dst_y: output row of 16-bit samples; 16 are written per loop iteration.
// scale: NOTE(review): assumed to be a non-zero power of two, since
//        __builtin_clz(0) is undefined - confirm against callers.
// width: number of samples to convert. NOTE(review): the loop only exits when
//        the counter reaches exactly zero, so this presumably must be a
//        positive multiple of 16 - confirm against callers.
|
||||||
|
void Convert8To16Row_NEON(const uint8_t* src_y,
|
||||||
|
uint16_t* dst_y,
|
||||||
|
int scale,
|
||||||
|
int width) {
|
||||||
|
// (src * 0x0101 * scale) >> 16.
|
||||||
|
// Since scale is a power of two, compute the shift to use to avoid needing
|
||||||
|
// to widen to int32.
// For scale = 2^k held in a 32-bit int, __builtin_clz(scale) is 31 - k, so
// shift is k - 16 (e.g. scale 1024 gives -6, i.e. a right shift by 6).
|
||||||
|
int shift = 15 - __builtin_clz(scale);
|
||||||
|
asm volatile(
|
||||||
|
"dup v2.8h, %w[shift] \n"  // Broadcast shift to all eight 16-bit lanes.
|
||||||
|
"1: \n"
|
||||||
|
"ldr q0, [%[src]], #16 \n"  // Load 16 input bytes, then advance src by 16.
|
||||||
|
// Zipping a register with itself duplicates every byte into both halves of
// a 16-bit lane, which equals src * 0x0101. zip2 (upper eight bytes) goes
// first because zip1 overwrites v0, which both instructions read.
"zip2 v1.16b, v0.16b, v0.16b \n"
|
||||||
|
"zip1 v0.16b, v0.16b, v0.16b \n"
|
||||||
|
"subs %w[width], %w[width], #16 \n"  // 16 samples done; sets flags for b.ne.
|
||||||
|
// ushl takes a signed per-lane shift count: a negative count shifts right.
"ushl v1.8h, v1.8h, v2.8h \n"
|
||||||
|
"ushl v0.8h, v0.8h, v2.8h \n"
|
||||||
|
"stp q0, q1, [%[dst]], #32 \n"  // Store 16 results (32 bytes), advance dst.
|
||||||
|
"b.ne 1b \n"  // Repeat until width reaches exactly zero.
|
||||||
|
: [src] "+r"(src_y), // %[src]
|
||||||
|
[dst] "+r"(dst_y), // %[dst]
|
||||||
|
[width] "+r"(width) // %[width]
|
||||||
|
: [shift] "r"(shift) // %[shift]
|
||||||
|
: "cc", "memory", "v0", "v1", "v2");
|
||||||
|
}
|
||||||
|
|
||||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user