[AArch64] Rework data loading in ScaleFilterCols_NEON

Lane-indexed LD2 instructions are slow and introduce an unnecessary
dependency on the previous iteration of the loop. To avoid this
dependency, use a scalar load for the first lane (which overwrites the
whole vector register) and lane-indexed LD1 loads for the remaining
lanes, then TRN1 and TRN2 to split out the even and odd elements.
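
The same data-movement scheme, sketched with Neon intrinsics rather
than the inline assembly the patch uses (the helper name and the
gather-through-a-stack-array shape are illustrative only; the real
code loads straight into vector lanes):

  #include <arm_neon.h>
  #include <stdint.h>
  #include <string.h>

  // Gather eight byte pairs at 16.16 fixed-point offsets, then split
  // them: TRN1/TRN2 against a zero vector deinterleave the even/odd
  // bytes and zero-extend them to 16 bits in a single instruction each.
  static void load_pairs_even_odd(const uint8_t* src, uint32_t x,
                                  uint32_t dx, uint16x8_t* even,
                                  uint16x8_t* odd) {
    uint16_t lanes[8];
    for (int i = 0; i < 8; ++i) {
      memcpy(&lanes[i], src + (x >> 16), 2);  // independent 2-byte load
      x += dx;
    }
    uint8x16_t pairs = vreinterpretq_u8_u16(vld1q_u16(lanes));
    uint8x16_t zero = vdupq_n_u8(0);
    *even = vreinterpretq_u16_u8(vtrn1q_u8(pairs, zero));  // bytes 0,2,...
    *odd = vreinterpretq_u16_u8(vtrn2q_u8(pairs, zero));   // bytes 1,3,...
  }

In the assembly below, the initial "ldr h6" writes the whole of v6, so
each loop iteration starts a fresh dependency chain; the LD2 lane loads
it replaces had to merge into the register written by the previous
iteration.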

Reduction in runtimes observed compared to the existing Neon
implementation:

 Cortex-A55:  -6.7%
Cortex-A510: -13.2%
Cortex-A520: -13.1%
 Cortex-A76: -54.5%
Cortex-A715: -60.3%
Cortex-A720: -61.0%
  Cortex-X1: -69.1%
  Cortex-X2: -68.6%
  Cortex-X3: -73.9%
  Cortex-X4: -73.8%
Cortex-X925: -69.0%

Bug: b/42280945
Change-Id: I1c4adfb82a43bdcf2dd4cc212088fc21a5812244
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872804
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Author: George Steed
Date: 2024-05-15 15:35:56 +01:00
Committer: Frank Barchard
Commit: 4621b0cc7f (parent: faade2f73f)

@@ -1103,15 +1103,12 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
   );
 }
 
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                      \
-  "lsr        %5, %3, #16                    \n" \
-  "add        %6, %1, %5                     \n" \
-  "add        %3, %3, %4                     \n" \
-  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
+#define SCALE_FILTER_COLS_STEP_ADDR                      \
+  "lsr        %[tmp_offset], %x[x], #16              \n" \
+  "add        %[tmp_ptr], %[src_ptr], %[tmp_offset]  \n" \
+  "add        %x[x], %x[x], %x[dx]                   \n"
 
-// The NEON version mimics this formula (from row_common.cc):
+// The Neon version mimics this formula (from scale_common.cc):
 // #define BLENDER(a, b, f) (uint8_t)((int)(a) +
 //        ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
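
For orientation, a scalar model of the loop this Neon code implements,
in the shape of ScaleFilterCols_C from scale_common.cc (function and
variable names here are illustrative): x and dx are 16.16 fixed-point,
so x >> 16 selects the source pixel pair and x & 0xffff is the blend
fraction.

  #include <stdint.h>

  #define BLENDER(a, b, f) (uint8_t)((int)(a) + \
      ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

  static void scale_filter_cols_c(uint8_t* dst, const uint8_t* src,
                                  int dst_width, int x, int dx) {
    for (int j = 0; j < dst_width; ++j) {
      int xi = x >> 16;  // integer source index
      *dst++ = BLENDER(src[xi], src[xi + 1], x & 0xffff);
      x += dx;           // step in 16.16 fixed point
    }
  }

The Neon version below computes eight such blends per iteration,
keeping the per-lane fractions in v20/v21 and letting rshrn fold the
+0x8000 rounding and >> 16 into one instruction.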
@@ -1121,65 +1118,69 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                           int x,
                           int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
-  int* tmp = dx_offset;
-  const uint8_t* src_tmp = src_ptr;
-  int64_t x64 = (int64_t)x;    // NOLINT
-  int64_t dx64 = (int64_t)dx;  // NOLINT
+  int64_t tmp_offset;
+  uint8_t* tmp_ptr;
   asm volatile(
-      "dup        v0.4s, %w3                     \n"  // x
-      "dup        v1.4s, %w4                     \n"  // dx
-      "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+      "dup        v0.4s, %w[x]                   \n"
+      "dup        v1.4s, %w[dx]                  \n"
+      "ld1        {v2.4s}, [%[dx_offset]]        \n"  // 0 1 2 3
       "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
+      "shl        v22.4s, v1.4s, #3              \n"  // 8 * dx
       "mul        v1.4s, v1.4s, v2.4s            \n"
       // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
       "add        v1.4s, v1.4s, v0.4s            \n"
       // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
       "add        v2.4s, v1.4s, v3.4s            \n"
-      "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
-      "1:                                        \n"
-      LOAD2_DATA8_LANE(0)
-      LOAD2_DATA8_LANE(1)
-      LOAD2_DATA8_LANE(2)
-      LOAD2_DATA8_LANE(3)
-      LOAD2_DATA8_LANE(4)
-      LOAD2_DATA8_LANE(5)
-      LOAD2_DATA8_LANE(6)
-      LOAD2_DATA8_LANE(7)
-      "mov        v6.16b, v1.16b                 \n"
-      "mov        v7.16b, v2.16b                 \n"
-      "uzp1       v6.8h, v6.8h, v7.8h            \n"
-      "ushll      v4.8h, v4.8b, #0               \n"
-      "ushll      v5.8h, v5.8b, #0               \n"
+      "movi       v0.8h, #0                      \n"
+      // truncate to uint16_t
+      "trn1       v22.8h, v22.8h, v0.8h          \n"
+      "trn1       v20.8h, v1.8h, v0.8h           \n"
+      "trn1       v21.8h, v2.8h, v0.8h           \n"
+      "1:                                        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ldr        h6, [%[tmp_ptr]]               \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[1], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[2], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[3], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[4], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[5], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[6], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[7], [%[tmp_ptr]]        \n"
+      "subs       %w[width], %w[width], #8       \n"  // 8 processed per loop
+      "trn1       v4.16b, v6.16b, v0.16b         \n"
+      "trn2       v5.16b, v6.16b, v0.16b         \n"
       "ssubl      v16.4s, v5.4h, v4.4h           \n"
       "ssubl2     v17.4s, v5.8h, v4.8h           \n"
-      "ushll      v7.4s, v6.4h, #0               \n"
-      "ushll2     v6.4s, v6.8h, #0               \n"
-      "mul        v16.4s, v16.4s, v7.4s          \n"
-      "mul        v17.4s, v17.4s, v6.4s          \n"
+      "mul        v16.4s, v16.4s, v20.4s         \n"
+      "mul        v17.4s, v17.4s, v21.4s         \n"
       "rshrn      v6.4h, v16.4s, #16             \n"
       "rshrn2     v6.8h, v17.4s, #16             \n"
       "add        v4.8h, v4.8h, v6.8h            \n"
       "xtn        v4.8b, v4.8h                   \n"
-      "st1        {v4.8b}, [%0], #8              \n"  // store pixels
-      "add        v1.4s, v1.4s, v0.4s            \n"
-      "add        v2.4s, v2.4s, v0.4s            \n"
-      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "add        v20.8h, v20.8h, v22.8h         \n"
+      "add        v21.8h, v21.8h, v22.8h         \n"
+      "st1        {v4.8b}, [%[dst_ptr]], #8      \n"  // store pixels
       "b.gt       1b                             \n"
-      : "+r"(dst_ptr),    // %0
-        "+r"(src_ptr),    // %1
-        "+r"(dst_width),  // %2
-        "+r"(x64),        // %3
-        "+r"(dx64),       // %4
-        "+r"(tmp),        // %5
-        "+r"(src_tmp)     // %6
-      :
-      : "memory", "cc", "v0", "v1", "v2", "v3",
-        "v4", "v5", "v6", "v7", "v16", "v17"
-  );
+      : [src_ptr] "+r"(src_ptr),          // %[src_ptr]
+        [dst_ptr] "+r"(dst_ptr),          // %[dst_ptr]
+        [width] "+r"(dst_width),          // %[width]
+        [x] "+r"(x),                      // %[x]
+        [dx] "+r"(dx),                    // %[dx]
+        [tmp_offset] "=&r"(tmp_offset),   // %[tmp_offset]
+        [tmp_ptr] "=&r"(tmp_ptr)          // %[tmp_ptr]
+      : [dx_offset] "r"(dx_offset)        // %[dx_offset]
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v20", "v21", "v22");
 }
 
-#undef LOAD2_DATA8_LANE
+#undef SCALE_FILTER_COLS_STEP_ADDR
 
 void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,