mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
[AArch64] Add Neon dot-product implementation for ARGBGrayRow
We can use dot product instructions to apply the coefficients without needing to use LD4 deinterleaving load instructions, and then use TBL to mix in the original alpha component. This is significantly faster on some micro-architectures where LD4 instructions are known to be slow compared to normal loads.

Reduction in cycle counts observed compared to existing Neon code:

Cortex-A55: -12.6%
Cortex-A510: -48.6%
Cortex-A76: -39.7%
Cortex-A720: -52.3%
Cortex-X1: -63.5%
Cortex-X2: -67.0%

Bug: b/42280946
Change-Id: I3641785e74873438acc00d675f5bc490dfa95b50
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5785972
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
2d62d8d22a
commit
1c31461771
@ -577,6 +577,7 @@ extern "C" {
|
||||
#define HAS_BGRATOYROW_NEON_DOTPROD
|
||||
#define HAS_RGBATOYJROW_NEON_DOTPROD
|
||||
#define HAS_RGBATOYROW_NEON_DOTPROD
|
||||
#define HAS_ARGBGRAYROW_NEON_DOTPROD
|
||||
|
||||
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUV444ROW_NEON_I8MM
|
||||
@ -6151,6 +6152,9 @@ void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
|
||||
void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
|
||||
void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
|
||||
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
|
||||
// AArch64 Neon dot-product (UDOT) variant of ARGBGrayRow. The dispatch code in
// ARGBGrayTo() and ARGBGray() selects it only when
// TestCpuFlag(kCpuHasNeonDotProd) is set and width is a multiple of 8.
void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int width);
|
||||
void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
|
||||
void ARGBGrayRow_LSX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
|
||||
void ARGBGrayRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width);
|
||||
|
||||
@ -3751,6 +3751,11 @@ int ARGBGrayTo(const uint8_t* src_argb,
|
||||
ARGBGrayRow = ARGBGrayRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBGRAYROW_NEON_DOTPROD)
|
||||
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
|
||||
ARGBGrayRow = ARGBGrayRow_NEON_DotProd;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBGRAYROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
|
||||
ARGBGrayRow = ARGBGrayRow_MSA;
|
||||
@ -3806,6 +3811,11 @@ int ARGBGray(uint8_t* dst_argb,
|
||||
ARGBGrayRow = ARGBGrayRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBGRAYROW_NEON_DOTPROD)
|
||||
if (TestCpuFlag(kCpuHasNeonDotProd) && IS_ALIGNED(width, 8)) {
|
||||
ARGBGrayRow = ARGBGrayRow_NEON_DotProd;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBGRAYROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
|
||||
ARGBGrayRow = ARGBGrayRow_MSA;
|
||||
|
||||
@ -4192,6 +4192,38 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
|
||||
}
|
||||
|
||||
// Per-byte weights that UDOT applies to each 4-byte pixel in
// ARGBGrayRow_NEON_DotProd: bytes 0..2 are weighted 29, 150 and 77 (which sum
// to 256) and byte 3 (alpha) is weighted 0 so it does not contribute to the
// gray value. Only these first four bytes are used: the kernel loads them with
// LD1R as a single 32-bit element replicated to every lane.
// NOTE(review): bytes 0..2 are presumably B, G, R in libyuv's ARGB memory
// order - confirm against the C reference implementation.
static const uvec8 kARGBGrayRowCoeffs = {29, 150, 77, 0};
|
||||
// TBL byte indices used by ARGBGrayRow_NEON_DotProd to assemble four output
// pixels from a two-register table {gray, original}. Indices 0..15 address the
// first register and 16..31 the second, so for output pixel i:
//   bytes 0..2 <- index 2*i      (narrowed gray value of pixel i)
//   byte  3    <- index 16+4*i+3 (byte 3, i.e. alpha, of source pixel i)
static const uvec8 kARGBGrayRowIndices = {0, 0, 0, 19, 2, 2, 2, 23,
|
||||
4, 4, 4, 27, 6, 6, 6, 31};
|
||||
|
||||
// Convert a row of ARGB pixels to gray using the Neon dot-product (UDOT)
// instruction, 8 pixels (32 bytes) per loop iteration.
//
// For every pixel:
//   gray = (29 * byte0 + 150 * byte1 + 77 * byte2 + 128) >> 8
// (rounding shift, unsigned saturating narrow). The gray value is written to
// output bytes 0..2 and source byte 3 (alpha) is copied through unchanged.
//
//   src_argb  source row, 4 bytes per pixel.
//   dst_argb  destination row, 4 bytes per pixel.
//   width     pixel count. The loop body executes before the count is tested
//             and always consumes 8 pixels, so width is expected to be a
//             positive multiple of 8 (the dispatch code checks
//             IS_ALIGNED(width, 8) before selecting this function).
//
// Plain LDP/STP loads and stores are used instead of de-interleaving LD4/ST4;
// the alpha byte is merged back with TBL.
void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// v24 = the 4 coefficient bytes {29, 150, 77, 0} replicated to all 4 lanes.
"ld1r {v24.4s}, [%[coeffs]] \n"
|
||||
// v25 = TBL indices (kARGBGrayRowIndices).
"ldr q25, [%[indices]] \n"
|
||||
"1: \n"
|
||||
"ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB
|
||||
// UDOT accumulates into its destination, so clear the accumulators first.
"movi v0.4s, #0 \n"
|
||||
"movi v2.4s, #0 \n"
|
||||
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
|
||||
// Each 32-bit lane = 29*byte0 + 150*byte1 + 77*byte2 + 0*byte3 of one pixel.
// The maximum is 256 * 255 = 65280, so the upper 16 bits of each lane are 0.
"udot v0.4s, v1.16b, v24.16b \n"
|
||||
"udot v2.4s, v3.16b, v24.16b \n"
|
||||
// Prefetch source data 448 bytes ahead of the (already advanced) pointer.
"prfm pldl1keep, [%[src], 448] \n"
|
||||
// Rounding shift right by 8 and narrow each 16-bit half to a byte: the gray
// values land in bytes 0, 2, 4, 6; the (zero) upper halves give bytes 1, 3, 5, 7.
"uqrshrn v0.8b, v0.8h, #8 \n"
|
||||
"uqrshrn v2.8b, v2.8h, #8 \n"
|
||||
// Table = {gray, original pixels}; v25 replicates gray into bytes 0..2 of
// each pixel and takes byte 3 from the original pixel.
"tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha
|
||||
// The result goes to v1 (its original contents are no longer needed) so that
// q0/q1 hold the 8 output pixels for the paired store.
"tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n"
|
||||
"stp q0, q1, [%[dst]], #32 \n" // store 8 pixels
|
||||
// Flags were set by the SUBS above; loop while the remaining width is > 0.
"b.gt 1b \n"
|
||||
: [src] "+r"(src_argb), // %[src]
|
||||
[dst] "+r"(dst_argb), // %[dst]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [coeffs] "r"(&kARGBGrayRowCoeffs), // %[coeffs]
|
||||
[indices] "r"(&kARGBGrayRowIndices) // %[indices]
|
||||
// Clobbers: condition flags (SUBS), memory (loads/stores through src/dst),
// and every vector register written by the asm.
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25");
|
||||
}
|
||||
|
||||
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
|
||||
// b = (r * 35 + g * 68 + b * 17) >> 7
|
||||
// g = (r * 45 + g * 88 + b * 22) >> 7
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user