[AArch64] Unroll and use TBL in ScaleRowDown34_NEON

ST3 is known to be slow on a number of modern micro-architectures. By
unrolling the code we are able to use TBL to shuffle elements into the
correct indices without needing to use LD4 and ST3, giving a good
improvement in performance across the board.

Reduction in runtimes observed compared to the existing Neon
implementation:

 Cortex-A55: -14.4%
Cortex-A510: -66.0%
Cortex-A520: -50.8%
 Cortex-A76: -60.5%
Cortex-A715: -63.9%
Cortex-A720: -64.2%
  Cortex-X1: -74.3%
  Cortex-X2: -75.4%
  Cortex-X3: -75.5%
  Cortex-X4: -48.1%

Bug: b/42280945
Change-Id: Ia1efb03af2d6ec00bc5a4b72168963fede9f0c83
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5785971
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-05-15 21:30:25 +01:00 committed by Frank Barchard
parent d5303f4f77
commit 23a6a412e5
2 changed files with 39 additions and 18 deletions

View File

@ -249,13 +249,13 @@ SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
23)
#endif
#ifdef HAS_SCALEROWDOWN34_NEON
#ifdef __aarch64__
SDANY(ScaleRowDown34_Any_NEON,
ScaleRowDown34_NEON,
ScaleRowDown34_C,
4 / 3,
1,
23)
#ifdef __aarch64__
47)
SDANY(ScaleRowDown34_0_Box_Any_NEON,
ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C,
@ -269,6 +269,12 @@ SDANY(ScaleRowDown34_1_Box_Any_NEON,
1,
47)
#else
SDANY(ScaleRowDown34_Any_NEON,
ScaleRowDown34_NEON,
ScaleRowDown34_C,
4 / 3,
1,
23)
SDANY(ScaleRowDown34_0_Box_Any_NEON,
ScaleRowDown34_0_Box_NEON,
ScaleRowDown34_0_Box_C,

View File

@ -155,27 +155,42 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
// Down scale from 4 to 3 pixels. Use the Neon multi-lane read/write to load
// every 4th pixel into four different registers.
// Point samples 32 pixels to 24 pixels.
// TBL shuffle indices for 4:3 point-sample downscaling. Each table selects 16
// output bytes from a concatenated 32-byte register pair, keeping bytes 0, 1
// and 3 of every group of 4 source bytes (byte 2 of each group is dropped).
// First 16 output bytes, indexing into the pair {v0,v1} (source bytes 0-31).
static const uvec8 kShuf34_0 = {
0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20,
};
// Next 16 output bytes, indexing into the pair {v1,v2}: index 0 here is
// source byte 16, so the pattern continues at source byte 21.
static const uvec8 kShuf34_1 = {
5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25,
};
// Final 16 output bytes, indexing into the pair {v2,v3}: index 0 here is
// source byte 32, so the pattern continues at source byte 43.
static const uvec8 kShuf34_2 = {
11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28, 29, 31,
};
// Down scale from 4 to 3 pixels. Point samples 64 pixels to 48 pixels.
// Scales one row down from 4 pixels to 3 by point sampling: of every group of
// 4 source bytes, bytes 0, 1 and 3 are kept. Processes 64 source bytes to 48
// destination bytes per loop iteration using TBL shuffles (ST3 is slow on many
// micro-architectures, so a single ST1 of pre-shuffled registers is used
// instead).
//
// src_ptr:    source row; read in 64-byte chunks.
// src_stride: unused — this variant reads a single row.
// dst_ptr:    destination row; written in 48-byte chunks.
// dst_width:  number of destination bytes to produce; the caller is expected
//             to supply a multiple of 48 (any-width handling is done by the
//             SDANY wrapper).
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      // Load the three TBL index vectors once, outside the loop.
      "ld1         {v29.16b}, [%[kShuf34_0]]            \n"
      "ld1         {v30.16b}, [%[kShuf34_1]]            \n"
      "ld1         {v31.16b}, [%[kShuf34_2]]            \n"
      "1:                                               \n"
      "ld1         {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n"
      "subs        %w[width], %w[width], #48            \n"
      // Each TBL gathers 16 kept bytes from a 32-byte register pair.
      "tbl         v0.16b, {v0.16b, v1.16b}, v29.16b    \n"
      "prfm        pldl1keep, [%[src_ptr], 448]         \n"  // prefetch 7 lines ahead
      "tbl         v1.16b, {v1.16b, v2.16b}, v30.16b    \n"
      "tbl         v2.16b, {v2.16b, v3.16b}, v31.16b    \n"
      "st1         {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n"
      "b.gt        1b                                   \n"
      : [src_ptr] "+r"(src_ptr),                // %[src_ptr]
        [dst_ptr] "+r"(dst_ptr),                // %[dst_ptr]
        [width] "+r"(dst_width)                 // %[width]
      : [kShuf34_0] "r"(&kShuf34_0),            // %[kShuf34_0]
        [kShuf34_1] "r"(&kShuf34_1),            // %[kShuf34_1]
        [kShuf34_2] "r"(&kShuf34_2)             // %[kShuf34_2]
      : "memory", "cc", "v0", "v1", "v2", "v3", "v29", "v30", "v31");
}
void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,