[AArch64] Rework data loading in ScaleFilterCols_NEON

Lane-indexed LD2 instructions are slow and introduce an unnecessary
dependency on the previous iteration of the loop. To avoid this
dependency, use a scalar load for the first pair of elements and
lane-indexed LD1 loads for the remaining pairs, then use TRN1 and TRN2
against a zero vector to split out the even and odd elements.
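
As an illustrative sketch (not part of the patch), the even/odd split
that TRN1 and TRN2 perform against a zero vector can be written with
Neon intrinsics roughly as follows; the helper name is hypothetical:

    #include <arm_neon.h>

    // Split 8 interleaved (even, odd) byte pairs into two vectors of
    // zero-extended 16-bit elements. TRN1 interleaves the even-indexed
    // bytes with zero bytes, which on a little-endian target amounts to
    // a zero-extension to 16 bits; TRN2 does the same for the odd bytes.
    static inline void split_even_odd_u8(uint8x16_t pairs,
                                         uint16x8_t* even,
                                         uint16x8_t* odd) {
      const uint8x16_t zero = vdupq_n_u8(0);
      *even = vreinterpretq_u16_u8(vtrn1q_u8(pairs, zero));
      *odd = vreinterpretq_u16_u8(vtrn2q_u8(pairs, zero));
    }

The scalar LDR of the first pair writes the whole vector register
(zeroing the upper lanes), so the chain of lane-indexed LD1 loads in
each iteration starts from that load rather than from the register
value left by the previous iteration.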

Reduction in runtimes observed compared to the existing Neon
implementation:

 Cortex-A55:  -6.7%
Cortex-A510: -13.2%
Cortex-A520: -13.1%
 Cortex-A76: -54.5%
Cortex-A715: -60.3%
Cortex-A720: -61.0%
  Cortex-X1: -69.1%
  Cortex-X2: -68.6%
  Cortex-X3: -73.9%
  Cortex-X4: -73.8%
Cortex-X925: -69.0%

Bug: b/42280945
Change-Id: I1c4adfb82a43bdcf2dd4cc212088fc21a5812244
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872804
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
George Steed 2024-05-15 15:35:56 +01:00 committed by Frank Barchard
parent faade2f73f
commit 4621b0cc7f

@@ -1103,15 +1103,12 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
   );
 }
 
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                      \
-  "lsr        %5, %3, #16                    \n" \
-  "add        %6, %1, %5                     \n" \
-  "add        %3, %3, %4                     \n" \
-  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
+#define SCALE_FILTER_COLS_STEP_ADDR                         \
+  "lsr        %[tmp_offset], %x[x], #16                 \n" \
+  "add        %[tmp_ptr], %[src_ptr], %[tmp_offset]     \n" \
+  "add        %x[x], %x[x], %x[dx]                      \n"
 
-// The NEON version mimics this formula (from row_common.cc):
+// The Neon version mimics this formula (from scale_common.cc):
 // #define BLENDER(a, b, f) (uint8_t)((int)(a) +
 //   ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 
@@ -1121,65 +1118,69 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                           int x,
                           int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
-  int* tmp = dx_offset;
-  const uint8_t* src_tmp = src_ptr;
-  int64_t x64 = (int64_t)x;    // NOLINT
-  int64_t dx64 = (int64_t)dx;  // NOLINT
-  asm volatile (
-      "dup        v0.4s, %w3                     \n"  // x
-      "dup        v1.4s, %w4                     \n"  // dx
-      "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+  int64_t tmp_offset;
+  uint8_t* tmp_ptr;
+  asm volatile(
+      "dup        v0.4s, %w[x]                   \n"
+      "dup        v1.4s, %w[dx]                  \n"
+      "ld1        {v2.4s}, [%[dx_offset]]        \n"  // 0 1 2 3
       "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
+      "shl        v22.4s, v1.4s, #3              \n"  // 8 * dx
       "mul        v1.4s, v1.4s, v2.4s            \n"
       // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
       "add        v1.4s, v1.4s, v0.4s            \n"
       // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
       "add        v2.4s, v1.4s, v3.4s            \n"
-      "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
-      "1:                                        \n"
-      LOAD2_DATA8_LANE(0)
-      LOAD2_DATA8_LANE(1)
-      LOAD2_DATA8_LANE(2)
-      LOAD2_DATA8_LANE(3)
-      LOAD2_DATA8_LANE(4)
-      LOAD2_DATA8_LANE(5)
-      LOAD2_DATA8_LANE(6)
-      LOAD2_DATA8_LANE(7)
-      "mov        v6.16b, v1.16b                 \n"
-      "mov        v7.16b, v2.16b                 \n"
-      "uzp1       v6.8h, v6.8h, v7.8h            \n"
-      "ushll      v4.8h, v4.8b, #0               \n"
-      "ushll      v5.8h, v5.8b, #0               \n"
+
+      "movi       v0.8h, #0                      \n"
+
+      // truncate to uint16_t
+      "trn1       v22.8h, v22.8h, v0.8h          \n"
+      "trn1       v20.8h, v1.8h, v0.8h           \n"
+      "trn1       v21.8h, v2.8h, v0.8h           \n"
+
+      "1:                                        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ldr        h6, [%[tmp_ptr]]               \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[1], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[2], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[3], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[4], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[5], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[6], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[7], [%[tmp_ptr]]        \n"
+
+      "subs       %w[width], %w[width], #8       \n"  // 8 processed per loop
+      "trn1       v4.16b, v6.16b, v0.16b         \n"
+      "trn2       v5.16b, v6.16b, v0.16b         \n"
+
       "ssubl      v16.4s, v5.4h, v4.4h           \n"
       "ssubl2     v17.4s, v5.8h, v4.8h           \n"
-      "ushll      v7.4s, v6.4h, #0               \n"
-      "ushll2     v6.4s, v6.8h, #0               \n"
-      "mul        v16.4s, v16.4s, v7.4s          \n"
-      "mul        v17.4s, v17.4s, v6.4s          \n"
+      "mul        v16.4s, v16.4s, v20.4s         \n"
+      "mul        v17.4s, v17.4s, v21.4s         \n"
       "rshrn      v6.4h, v16.4s, #16             \n"
       "rshrn2     v6.8h, v17.4s, #16             \n"
       "add        v4.8h, v4.8h, v6.8h            \n"
       "xtn        v4.8b, v4.8h                   \n"
-      "st1        {v4.8b}, [%0], #8              \n"  // store pixels
-      "add        v1.4s, v1.4s, v0.4s            \n"
-      "add        v2.4s, v2.4s, v0.4s            \n"
-      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+
+      "add        v20.8h, v20.8h, v22.8h         \n"
+      "add        v21.8h, v21.8h, v22.8h         \n"
+
+      "st1        {v4.8b}, [%[dst_ptr]], #8      \n"  // store pixels
       "b.gt       1b                             \n"
-      : "+r"(dst_ptr),    // %0
-        "+r"(src_ptr),    // %1
-        "+r"(dst_width),  // %2
-        "+r"(x64),        // %3
-        "+r"(dx64),       // %4
-        "+r"(tmp),        // %5
-        "+r"(src_tmp)     // %6
-      :
-      : "memory", "cc", "v0", "v1", "v2", "v3",
-        "v4", "v5", "v6", "v7", "v16", "v17"
-  );
+      : [src_ptr] "+r"(src_ptr),         // %[src_ptr]
+        [dst_ptr] "+r"(dst_ptr),         // %[dst_ptr]
+        [width] "+r"(dst_width),         // %[width]
+        [x] "+r"(x),                     // %[x]
+        [dx] "+r"(dx),                   // %[dx]
+        [tmp_offset] "=&r"(tmp_offset),  // %[tmp_offset]
+        [tmp_ptr] "=&r"(tmp_ptr)         // %[tmp_ptr]
+      : [dx_offset] "r"(dx_offset)       // %[dx_offset]
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v20", "v21", "v22");
 }
 
-#undef LOAD2_DATA8_LANE
+#undef SCALE_FILTER_COLS_STEP_ADDR
 
 void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,