[AArch64] Add Neon dot-product implementation for ARGBSepiaRow

We can use the dot product instructions to apply the coefficients
directly, avoiding the LD4 de-interleaving load instructions, which are
known to be slow on some micro-architectures.

ST4 is also known to be slow on more modern micro-architectures, however
avoiding this is left for a future SVE implementation where we can make
use of interleaving-narrowing instructions.

Reduction in cycle counts observed compared to existing Neon code:

 Cortex-A55:  -5.8%
Cortex-A510: -18.9%
 Cortex-A76: -21.8%
Cortex-A720: -30.2%
  Cortex-X1: -28.6%
  Cortex-X2: -23.4%

Bug: b/42280946
Change-Id: I5887559649cc805a810d867b652c85d48285657d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5790970
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-18 14:01:21 +01:00 committed by Frank Barchard
parent 1c31461771
commit 432d186116
3 changed files with 48 additions and 0 deletions

View File

@ -578,6 +578,7 @@ extern "C" {
#define HAS_RGBATOYJROW_NEON_DOTPROD
#define HAS_RGBATOYROW_NEON_DOTPROD
#define HAS_ARGBGRAYROW_NEON_DOTPROD
#define HAS_ARGBSEPIAROW_NEON_DOTPROD
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUV444ROW_NEON_I8MM
@ -6162,6 +6163,7 @@ void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
// Sepia row functions, one per instruction-set variant. All share the same
// signature: a single non-const row pointer (the row is modified through it)
// and a width.
// NOTE(review): width is presumably a pixel count for every variant; only the
// NEON_DotProd implementation is visible here to confirm that.
void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width);
void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
void ARGBSepiaRow_LSX(uint8_t* dst_argb, int width);
void ARGBSepiaRow_LASX(uint8_t* dst_argb, int width);

View File

@ -3869,6 +3869,11 @@ int ARGBSepia(uint8_t* dst_argb,
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
#endif
#if defined(HAS_ARGBSEPIAROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_NEON_DotProd;
}
#endif
#if defined(HAS_ARGBSEPIAROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_MSA;

View File

@ -4265,6 +4265,47 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
}
// Sepia weights for ARGBSepiaRow_NEON_DotProd, stored as three groups of four
// bytes. Each group is broadcast as one 32-bit element and dotted against the
// four bytes of every pixel; group 0 produces output byte 0, group 1 output
// byte 1 and group 2 output byte 2. The fourth weight of every group is 0, so
// byte 3 of the pixel never contributes to the sums. Results are shifted right
// by 7 afterwards, i.e. the weights are in units of 1/128.
// NOTE(review): only 12 initializers are given; this assumes uvec8 has at
// least 12 elements and that the remainder is zero-initialized and unused.
static const uvec8 kARGBSepiaRowCoeffs = {17, 68, 35, 0, 22, 88,
45, 0, 24, 98, 50, 0};
// TBL indices selecting byte 3 of each of the 8 four-byte pixels held in a
// pair of 16-byte registers (bytes 0..31). Only these 8 bytes are loaded by
// ARGBSepiaRow_NEON_DotProd.
static const uvec8 kARGBSepiaRowAlphaIndices = {3, 7, 11, 15, 19, 23, 27, 31};
// Apply the sepia transform in place to a row of 4-byte pixels using the Neon
// dot-product (UDOT) instruction.
//
// For every pixel p[0..3]:
//   out[0] = sat8(dot(p, kARGBSepiaRowCoeffs[0..3])  >> 7)
//   out[1] = sat8(dot(p, kARGBSepiaRowCoeffs[4..7])  >> 7)
//   out[2] = sat8(dot(p, kARGBSepiaRowCoeffs[8..11]) >> 7)
//   out[3] = p[3]  (copied unchanged)
//
// dst_argb: row of pixels; read and then overwritten in place.
// width:    number of pixels. Each iteration consumes 8 pixels (32 bytes) and
//           the loop body always runs at least once, so callers must pass a
//           positive multiple of 8.
void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) {
  asm volatile(
      // Broadcast each 4-byte weight group across a full vector.
      "ld3r        {v20.4s, v21.4s, v22.4s}, [%[coeffs]] \n"
      // Low 8 bytes = byte-3 indices; LDR to a D register zeroes the upper
      // half, which is harmless because only the low 8 TBL results are stored.
      "ldr         d23, [%[indices]]             \n"
      "1:                                        \n"
      "ldp         q0, q1, [%[dst]]              \n"
      // UDOT accumulates, so the accumulators must be cleared every iteration.
      "movi        v2.4s, #0                     \n"
      "movi        v3.4s, #0                     \n"
      "movi        v4.4s, #0                     \n"
      "movi        v5.4s, #0                     \n"
      "movi        v6.4s, #0                     \n"
      "movi        v7.4s, #0                     \n"
      "udot        v2.4s, v0.16b, v20.16b        \n"
      "udot        v3.4s, v1.16b, v20.16b        \n"
      "udot        v4.4s, v0.16b, v21.16b        \n"
      "udot        v5.4s, v1.16b, v21.16b        \n"
      "udot        v6.4s, v0.16b, v22.16b        \n"
      "udot        v7.4s, v1.16b, v22.16b        \n"
      "subs        %w[width], %w[width], #8      \n"
      "prfm        pldl1keep, [%[dst], 448]      \n"
      // Keep the low 16 bits of every 32-bit sum. This is lossless: the
      // largest possible sum is 255 * (24 + 98 + 50) = 43860 < 65536.
      "uzp1        v6.8h, v6.8h, v7.8h           \n"
      "uzp1        v5.8h, v4.8h, v5.8h           \n"
      "uzp1        v4.8h, v2.8h, v3.8h           \n"
      // Gather the untouched byte 3 of each of the 8 source pixels.
      "tbl         v3.16b, {v0.16b, v1.16b}, v23.16b \n"
      // Scale by 1/128 and saturate to 8 bits (weight groups 1 and 2 sum to
      // more than 128, so saturation is required).
      "uqshrn      v0.8b, v4.8h, #7              \n"
      "uqshrn      v1.8b, v5.8h, #7              \n"
      "uqshrn      v2.8b, v6.8h, #7              \n"
      // Re-interleave the four byte planes back into 8 pixels.
      "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst]], #32 \n"
      "b.gt        1b                            \n"
      : [dst] "+r"(dst_argb),                      // %[dst]
        [width] "+r"(width)                        // %[width]
      : [coeffs] "r"(&kARGBSepiaRowCoeffs),        // %[coeffs]
        [indices] "r"(&kARGBSepiaRowAlphaIndices)  // %[indices]
      // Every vector register written above must be listed here, including
      // v23 which holds the TBL indices for the whole loop.
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v23");
}
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.