mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Add Neon dot-product implementation for ARGBSepiaRow
We can use the dot product instructions to apply the coefficients directly without the need for LD4 de-interleaving load instructions, since these are known to be slow on some micro-architectures. ST4 is also known to be slow on more modern micro-architectures, however avoiding this is left for a future SVE implementation where we can make use of interleaving-narrowing instructions. Reduction in cycle counts observed compared to existing Neon code: Cortex-A55: -5.8% Cortex-A510: -18.9% Cortex-A76: -21.8% Cortex-A720: -30.2% Cortex-X1: -28.6% Cortex-X2: -23.4% Bug: b/42280946 Change-Id: I5887559649cc805a810d867b652c85d48285657d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790970 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
1c31461771
commit
432d186116
@ -578,6 +578,7 @@ extern "C" {
|
||||
#define HAS_RGBATOYJROW_NEON_DOTPROD
|
||||
#define HAS_RGBATOYROW_NEON_DOTPROD
|
||||
#define HAS_ARGBGRAYROW_NEON_DOTPROD
|
||||
#define HAS_ARGBSEPIAROW_NEON_DOTPROD
|
||||
|
||||
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUV444ROW_NEON_I8MM
|
||||
@ -6162,6 +6163,7 @@ void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
|
||||
void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
|
||||
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
|
||||
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
|
||||
// AArch64 Neon dot-product variant of the sepia row kernel. Rewrites the
// pixels in dst_argb in place, 8 pixels (32 bytes) per loop iteration.
void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width);
|
||||
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
|
||||
void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width);
|
||||
void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width);
|
||||
|
||||
@ -3869,6 +3869,11 @@ int ARGBSepia(uint8_t* dst_argb,
|
||||
ARGBSepiaRow = ARGBSepiaRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBSEPIAROW_NEON_DOTPROD)
|
||||
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
|
||||
ARGBSepiaRow = ARGBSepiaRow_NEON_DotProd;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBSEPIAROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
|
||||
ARGBSepiaRow = ARGBSepiaRow_MSA;
|
||||
|
||||
@ -4265,6 +4265,47 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
|
||||
"v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
|
||||
}
|
||||
|
||||
// Sepia weights for ARGBSepiaRow_NEON_DotProd. Each group of four bytes is one
// 32-bit word that LD3R broadcasts across a vector, so UDOT multiplies it
// against the four bytes of every pixel. The fourth weight of each group is 0,
// so byte 3 of a pixel (alpha) never contributes to the sums.
static const uvec8 kARGBSepiaRowCoeffs = {
    17, 68, 35, 0,   // weights producing output byte 0 of each pixel
    22, 88, 45, 0,   // weights producing output byte 1 of each pixel
    24, 98, 50, 0};  // weights producing output byte 2 of each pixel

// TBL indices selecting byte 3 (alpha) of each of the 8 pixels held in a
// 32-byte, two-register table. Only these 8 bytes are loaded (LDR d23).
static const uvec8 kARGBSepiaRowAlphaIndices = {3,  7,  11, 15,
                                                19, 23, 27, 31};
|
||||
|
||||
void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"ld3r {v20.4s, v21.4s, v22.4s}, [%[coeffs]] \n"
|
||||
"ldr d23, [%[indices]] \n"
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[dst]] \n"
|
||||
"movi v2.4s, #0 \n"
|
||||
"movi v3.4s, #0 \n"
|
||||
"movi v4.4s, #0 \n"
|
||||
"movi v5.4s, #0 \n"
|
||||
"movi v6.4s, #0 \n"
|
||||
"movi v7.4s, #0 \n"
|
||||
"udot v2.4s, v0.16b, v20.16b \n"
|
||||
"udot v3.4s, v1.16b, v20.16b \n"
|
||||
"udot v4.4s, v0.16b, v21.16b \n"
|
||||
"udot v5.4s, v1.16b, v21.16b \n"
|
||||
"udot v6.4s, v0.16b, v22.16b \n"
|
||||
"udot v7.4s, v1.16b, v22.16b \n"
|
||||
"subs %w1, %w1, #8 \n"
|
||||
"prfm pldl1keep, [%[dst], 448] \n"
|
||||
"uzp1 v6.8h, v6.8h, v7.8h \n"
|
||||
"uzp1 v5.8h, v4.8h, v5.8h \n"
|
||||
"uzp1 v4.8h, v2.8h, v3.8h \n"
|
||||
"tbl v3.16b, {v0.16b, v1.16b}, v23.16b \n"
|
||||
"uqshrn v0.8b, v4.8h, #7 \n"
|
||||
"uqshrn v1.8b, v5.8h, #7 \n"
|
||||
"uqshrn v2.8b, v6.8h, #7 \n"
|
||||
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [dst] "+r"(dst_argb), // %[dst]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [coeffs] "r"(&kARGBSepiaRowCoeffs), // %[coeffs]
|
||||
[indices] "r"(&kARGBSepiaRowAlphaIndices) // %[indices]
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
|
||||
"v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
|
||||
}
|
||||
|
||||
// Transform 8 ARGB pixels (32 bytes) with color matrix.
|
||||
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
|
||||
// needs to saturate. Consider doing a non-saturating version.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user