mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[AArch64] Add SVE2 implementation of DivideRow_16
SVE contains the UMULH instruction, which allows us to multiply and take the high half of the result in a single instruction, rather than needing a separate widening multiply followed by a narrowing shift step.
Observed reduction in runtime compared to the existing Neon code: Cortex-A510: -21.2%; Cortex-A520: -20.9%; Cortex-A715: -47.9%; Cortex-A720: -47.6%; Cortex-X2: -5.2%; Cortex-X3: -2.6%; Cortex-X4: -32.4%; Cortex-X925: -1.5%.
Bug: b/42280942
Change-Id: I25154699b17772db1fb5cb84c049919181d86f4b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5975318
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
aec4b4e22e
commit
f27b983f38
@ -548,6 +548,7 @@ extern "C" {
|
|||||||
#define HAS_AYUVTOUVROW_SVE2
|
#define HAS_AYUVTOUVROW_SVE2
|
||||||
#define HAS_AYUVTOVUROW_SVE2
|
#define HAS_AYUVTOVUROW_SVE2
|
||||||
#define HAS_BGRATOUVROW_SVE2
|
#define HAS_BGRATOUVROW_SVE2
|
||||||
|
#define HAS_DIVIDEROW_16_SVE2
|
||||||
#define HAS_I400TOARGBROW_SVE2
|
#define HAS_I400TOARGBROW_SVE2
|
||||||
#define HAS_I422ALPHATOARGBROW_SVE2
|
#define HAS_I422ALPHATOARGBROW_SVE2
|
||||||
#define HAS_I422TOARGB1555ROW_SVE2
|
#define HAS_I422TOARGB1555ROW_SVE2
|
||||||
@ -3302,6 +3303,10 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
|||||||
uint16_t* dst_y,
|
uint16_t* dst_y,
|
||||||
int scale,
|
int scale,
|
||||||
int width);
|
int width);
|
||||||
|
// SVE2 row function for DivideRow_16; takes the same parameter list as
// DivideRow_16_NEON (source row, destination row, scale, width).
void DivideRow_16_SVE2(const uint16_t* src_y,
|
||||||
|
uint16_t* dst_y,
|
||||||
|
int scale,
|
||||||
|
int width);
|
||||||
void DivideRow_16_Any_NEON(const uint16_t* src_ptr,
|
void DivideRow_16_Any_NEON(const uint16_t* src_ptr,
|
||||||
uint16_t* dst_ptr,
|
uint16_t* dst_ptr,
|
||||||
int scale,
|
int scale,
|
||||||
|
|||||||
@ -877,6 +877,11 @@ void ConvertToLSBPlane_16(const uint16_t* src_y,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_DIVIDEROW_16_SVE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||||
|
DivideRow = DivideRow_16_SVE2;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for (y = 0; y < height; ++y) {
|
for (y = 0; y < height; ++y) {
|
||||||
DivideRow(src_y, dst_y, scale, width);
|
DivideRow(src_y, dst_y, scale, width);
|
||||||
|
|||||||
@ -1620,6 +1620,56 @@ void ARGBToRAWRow_SVE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
|
|||||||
ARGBToXYZRow_SVE2(src_argb, dst_rgb, width, kARGBToRAWRowIndices);
|
ARGBToXYZRow_SVE2(src_argb, dst_rgb, width, kARGBToRAWRowIndices);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Scales a row of 16-bit values: each element of src_y is multiplied by
// scale (broadcast to every 16-bit lane) and the high half of the product,
// as produced by UMULH, is stored to dst_y. width is the number of elements
// to process. Two vectors are handled per iteration; a predicated final
// iteration covers whatever remains.
void DivideRow_16_SVE2(const uint16_t* src_y,
|
||||||
|
uint16_t* dst_y,
|
||||||
|
int scale,
|
||||||
|
int width) {
|
||||||
|
uint64_t vl;
|
||||||
|
asm volatile(
|
||||||
|
// vl = number of 16-bit lanes in one vector.
"cnth %x[vl] \n"
|
||||||
|
// Broadcast scale to every 16-bit lane of z0.
"dup z0.h, %w[scale] \n"
|
||||||
|
// Pre-subtract one full iteration (2 * vl elements); if that leaves
// nothing beyond it (result <= 0), go straight to the tail at 2:.
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
|
||||||
|
"b.le 2f \n"
|
||||||
|
|
||||||
|
// Run bulk of computation with the same predicates to avoid predicate
|
||||||
|
// generation overhead.
|
||||||
|
"ptrue p0.h \n"
|
||||||
|
"1: \n"
|
||||||
|
"ld1h {z1.h}, p0/z, [%[src]] \n"
|
||||||
|
"ld1h {z2.h}, p0/z, [%[src], #1, mul vl] \n"
|
||||||
|
"incb %[src], all, mul #2 \n"
|
||||||
|
"umulh z1.h, z1.h, z0.h \n"
|
||||||
|
"umulh z2.h, z2.h, z0.h \n"
|
||||||
|
"subs %w[width], %w[width], %w[vl], lsl #1 \n"
|
||||||
|
"st1h {z1.h}, p0, [%[dst]] \n"
|
||||||
|
"st1h {z2.h}, p0, [%[dst], #1, mul vl] \n"
|
||||||
|
"incb %[dst], all, mul #2 \n"
|
||||||
|
"b.gt 1b \n"
|
||||||
|
|
||||||
|
"2: \n"
|
||||||
|
// Undo the last subtraction to recover the count of elements still to
// process; skip the tail entirely when that count is zero.
"adds %w[width], %w[width], %w[vl], lsl #1 \n"
|
||||||
|
"b.eq 99f \n"
|
||||||
|
|
||||||
|
// Calculate a pair of predicates for the final iteration to deal with
|
||||||
|
// the tail.
|
||||||
|
// p0 enables lanes whose index (counting from 0) is below width; p1 does
// the same for the second vector, counting from vl.
"whilelt p0.h, wzr, %w[width] \n"
|
||||||
|
"whilelt p1.h, %w[vl], %w[width] \n"
|
||||||
|
"ld1h {z1.h}, p0/z, [%[src]] \n"
|
||||||
|
"ld1h {z2.h}, p1/z, [%[src], #1, mul vl] \n"
|
||||||
|
"umulh z1.h, z1.h, z0.h \n"
|
||||||
|
"umulh z2.h, z2.h, z0.h \n"
|
||||||
|
"st1h {z1.h}, p0, [%[dst]] \n"
|
||||||
|
"st1h {z2.h}, p1, [%[dst], #1, mul vl] \n"
|
||||||
|
|
||||||
|
"99: \n"
|
||||||
|
: [src] "+r"(src_y), // %[src]
|
||||||
|
[dst] "+r"(dst_y), // %[dst]
|
||||||
|
[width] "+r"(width), // %[width]
|
||||||
|
[vl] "=&r"(vl) // %[vl]
|
||||||
|
: [scale] "r"(scale) // %[scale]
|
||||||
|
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
|
||||||
|
}
|
||||||
|
|
||||||
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
|
#endif // !defined(LIBYUV_DISABLE_SVE) && defined(__aarch64__)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user