[AArch64] Add I8MM implementation of ARGBColorMatrixRow

We cannot use the standard dot-product instructions since the matrix of
coefficients is signed while the pixel data is unsigned, but I8MM supports
mixed-sign dot products, which work well here.

Reduction in runtimes observed compared to the previous Neon
implementation:

Cortex-A510: -50.8%
Cortex-A520: -33.3%
Cortex-A715: -38.6%
Cortex-A720: -38.5%
  Cortex-X2: -43.2%
  Cortex-X3: -40.0%
  Cortex-X4: -55.0%

Change-Id: Ia4fe486faf8f43d0b837ad21bb37e2159f3bdb77
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5621577
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-04-10 15:46:42 +01:00 committed by Frank Barchard
parent 89cf221baa
commit a758a15dbf
3 changed files with 67 additions and 0 deletions

View File

@ -569,6 +569,8 @@ extern "C" {
#define HAS_BGRATOYROW_NEON_DOTPROD
#define HAS_RGBATOYJROW_NEON_DOTPROD
#define HAS_RGBATOYROW_NEON_DOTPROD
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
#endif
// The following are available on AArch64 SVE platforms:
@ -6022,6 +6024,10 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
// AArch64 I8MM variant of ARGBColorMatrixRow: applies the 16 signed 8-bit
// coefficients in matrix_argb to each 4-byte pixel of src_argb and writes the
// result to dst_argb. The implementation handles 8 pixels per loop iteration
// and is only selected by ARGBColorMatrix when width is a multiple of 8.
void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width);
void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,

View File

@ -3907,6 +3907,11 @@ int ARGBColorMatrix(const uint8_t* src_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
#endif
#if defined(HAS_ARGBCOLORMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNeonI8MM) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON_I8MM;
}
#endif
#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;

View File

@ -4039,6 +4039,62 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
"v17", "v18", "v19", "v22", "v23", "v24", "v25");
}
// Apply a 4x4 signed colour matrix to a row of 4-byte pixels using the I8MM
// mixed-sign dot-product instruction (SUDOT).
//
//   src_argb    - input pixels, 4 unsigned bytes per pixel.
//   dst_argb    - output pixels, 4 unsigned bytes per pixel.
//   matrix_argb - 16 signed 8-bit coefficients; coefficients [4*c .. 4*c+3]
//                 are dotted with the 4 source bytes of a pixel to produce
//                 output channel c.
//   width       - number of pixels. The loop body runs before the count is
//                 tested and consumes 8 pixels (32 bytes) per iteration, so
//                 width must be a positive multiple of 8.
//
// Each output channel is (dot product >> 6) clamped to [0, 255].
void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const int8_t* matrix_argb,
                                  int width) {
  asm("ld1         {v31.16b}, [%[matrix_argb]]          \n"  // 16 coefficients

      "1:                                               \n"
      "ld1         {v0.16b, v1.16b}, [%[src_argb]], #32 \n"  // load 8 pixels

      // SUDOT accumulates into its destination, so clear one accumulator
      // per pixel first.
      "movi        v16.4s, #0                           \n"
      "movi        v17.4s, #0                           \n"
      "movi        v18.4s, #0                           \n"
      "movi        v19.4s, #0                           \n"
      "movi        v20.4s, #0                           \n"
      "movi        v21.4s, #0                           \n"
      "movi        v22.4s, #0                           \n"
      "movi        v23.4s, #0                           \n"

      // 8 processed per loop.
      "subs        %w[width], %w[width], #8             \n"
      "prfm        pldl1keep, [%[src_argb], 448]        \n"

      // One pixel per instruction: signed coefficients (v31) times the four
      // unsigned bytes of the selected pixel, giving four 32-bit channel sums.
      "sudot       v16.4s, v31.16b, v0.4b[0]            \n"
      "sudot       v17.4s, v31.16b, v0.4b[1]            \n"
      "sudot       v18.4s, v31.16b, v0.4b[2]            \n"
      "sudot       v19.4s, v31.16b, v0.4b[3]            \n"
      "sudot       v20.4s, v31.16b, v1.4b[0]            \n"
      "sudot       v21.4s, v31.16b, v1.4b[1]            \n"
      "sudot       v22.4s, v31.16b, v1.4b[2]            \n"
      "sudot       v23.4s, v31.16b, v1.4b[3]            \n"

      // The sums are signed. Use a signed-saturating-to-unsigned narrowing
      // shift so that negative results clamp to 0; a plain truncating SHRN
      // would leave them as large unsigned 16-bit values that the UQXTN
      // below turns into 255.
      "sqshrun     v16.4h, v16.4s, #6                   \n"
      "sqshrun     v18.4h, v18.4s, #6                   \n"
      "sqshrun     v20.4h, v20.4s, #6                   \n"
      "sqshrun     v22.4h, v22.4s, #6                   \n"
      "sqshrun2    v16.8h, v17.4s, #6                   \n"
      "sqshrun2    v18.8h, v19.4s, #6                   \n"
      "sqshrun2    v20.8h, v21.4s, #6                   \n"
      "sqshrun2    v22.8h, v23.4s, #6                   \n"

      // Unsigned 16-bit -> 8-bit with saturation clamps the upper bound to 255.
      "uqxtn       v16.8b, v16.8h                       \n"
      "uqxtn       v18.8b, v18.8h                       \n"
      "uqxtn       v20.8b, v20.8h                       \n"
      "uqxtn       v22.8b, v22.8h                       \n"

      // Each d register now holds two finished pixels; store all 8.
      "stp         d16, d18, [%[dst_argb]], #16         \n"
      "stp         d20, d22, [%[dst_argb]], #16         \n"
      "b.gt        1b                                   \n"
      : [src_argb] "+r"(src_argb),       // %[src_argb]
        [dst_argb] "+r"(dst_argb),       // %[dst_argb]
        [width] "+r"(width)              // %[width]
      : [matrix_argb] "r"(matrix_argb)   // %[matrix_argb]
      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
        "v22", "v23", "v31");
}
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,