[AArch64] Add Neon dot-product implementation for ARGBGrayRow

We can use dot product instructions to apply the coefficients without
needing to use LD4 deinterleaving load instructions, and then TBL to mix
in the original alpha component. This is significantly faster on some
micro-architectures where LD4 instructions are known to be slow compared
to normal loads.

Reduction in cycle counts observed compared to existing Neon code:

 Cortex-A55: -12.6%
Cortex-A510: -48.6%
 Cortex-A76: -39.7%
Cortex-A720: -52.3%
  Cortex-X1: -63.5%
  Cortex-X2: -67.0%

Bug: b/42280946
Change-Id: I3641785e74873438acc00d675f5bc490dfa95b50
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5785972
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-18 10:40:41 +01:00 committed by Frank Barchard
parent 2d62d8d22a
commit 1c31461771
3 changed files with 46 additions and 0 deletions

View File

@ -577,6 +577,7 @@ extern "C" {
#define HAS_BGRATOYROW_NEON_DOTPROD
#define HAS_RGBATOYJROW_NEON_DOTPROD
#define HAS_RGBATOYROW_NEON_DOTPROD
#define HAS_ARGBGRAYROW_NEON_DOTPROD
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUV444ROW_NEON_I8MM
@ -6151,6 +6152,9 @@ void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_argb,
int width);
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);

View File

@ -3751,6 +3751,11 @@ int ARGBGrayTo(const uint8_t* src_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
#if defined(HAS_ARGBGRAYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON_DotProd;
}
#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;
@ -3806,6 +3811,11 @@ int ARGBGray(uint8_t* dst_argb,
ARGBGrayRow = ARGBGrayRow_NEON;
}
#endif
#if defined(HAS_ARGBGRAYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON_DotProd;
}
#endif
#if defined(HAS_ARGBGRAYROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_MSA;

View File

@ -4192,6 +4192,38 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
}
// Per-byte dot-product weights, one set per 32-bit (one-pixel) lane:
// gray = (29*b0 + 150*b1 + 77*b2 + 0*b3 + 128) >> 8  (rounding via UQRSHRN).
// Weights sum to 256, so the result stays in [0, 255]. The byte order
// presumably matches libyuv's in-memory ARGB layout (B, G, R, A) with the
// BT.601-style luma weights on B/G/R and 0 on alpha — confirm against
// ARGBGrayRow_C.
static const uvec8 kARGBGrayRowCoeffs = {29, 150, 77, 0};
// TBL indices over the register pair {narrowed gray, original pixels}:
// indices 0-15 address the first register, 16-31 the second. 0/2/4/6 pick
// the gray bytes (the dot-product results land in even bytes after the
// .8h narrowing below); 19/23/27/31 pick bytes 3/7/11/15 of the original
// pixels, i.e. the untouched alpha of each of the four pixels.
static const uvec8 kARGBGrayRowIndices = {0, 0, 0, 19, 2, 2, 2, 23,
4, 4, 4, 27, 6, 6, 6, 31};
// Convert 8 ARGB pixels (32 bytes) per loop iteration to grayscale ARGB,
// preserving each pixel's original alpha. Uses UDOT on interleaved data so
// no LD4 deinterleaving load is needed (LD4 is slow on some cores).
// width is assumed to be a positive multiple of 8 — the visible callers
// guard with IS_ALIGNED(width, 8).
void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
// v24 = coefficient quadruple replicated into all four 32-bit lanes.
"ld1r {v24.4s}, [%[coeffs]] \n"
// v25 = TBL shuffle pattern (constant across the loop).
"ldr q25, [%[indices]] \n"
"1: \n"
"ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB
// Zero the UDOT accumulators each iteration (UDOT accumulates).
"movi v0.4s, #0 \n"
"movi v2.4s, #0 \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
// Each 32-bit lane becomes the weighted sum of that pixel's 4 bytes.
"udot v0.4s, v1.16b, v24.16b \n"
"udot v2.4s, v3.16b, v24.16b \n"
"prfm pldl1keep, [%[src], 448] \n"
// Every sum fits in 16 bits (max 255*256), so the upper half of each
// 32-bit lane is zero; viewing the accumulator as .8h, the values sit
// in even halfwords. Rounding-narrow by 8 puts the final gray bytes in
// even byte positions 0/2/4/6 (odd bytes become zero).
"uqrshrn v0.8b, v0.8h, #8 \n"
"uqrshrn v2.8b, v2.8h, #8 \n"
"tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha
"tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n"
"stp q0, q1, [%[dst]], #32 \n" // store 8 pixels
"b.gt 1b \n"
: [src] "+r"(src_argb), // %[src]
[dst] "+r"(dst_argb), // %[dst]
[width] "+r"(width) // %[width]
: [coeffs] "r"(&kARGBGrayRowCoeffs), // %[coeffs]
[indices] "r"(&kARGBGrayRowIndices) // %[indices]
// cc: SUBS sets flags; memory: loads/stores through src/dst pointers.
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25");
}
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7