Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-06 16:56:55 +08:00)
[AArch64] Rework data loading in ScaleFilterCols_NEON
Lane-indexed LD2 instructions are slow and introduce an unnecessary
dependency on the previous iteration of the loop. To avoid this dependency,
use a scalar load for the first iteration and lane-indexed LD1 for the
remainder, then TRN1 and TRN2 to split out the even and odd elements.

Reduction in runtimes observed compared to the existing Neon implementation:

  Cortex-A55:  -6.7%
  Cortex-A510: -13.2%
  Cortex-A520: -13.1%
  Cortex-A76:  -54.5%
  Cortex-A715: -60.3%
  Cortex-A720: -61.0%
  Cortex-X1:   -69.1%
  Cortex-X2:   -68.6%
  Cortex-X3:   -73.9%
  Cortex-X4:   -73.8%
  Cortex-X925: -69.0%

Bug: b/42280945
Change-Id: I1c4adfb82a43bdcf2dd4cc212088fc21a5812244
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872804
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
commit 4621b0cc7f
parent faade2f73f
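The deinterleaving trick described above is easier to see outside of inline assembly. The sketch below is illustrative only (the helper name and the use of Neon intrinsics are not part of the patch): it splits a vector of gathered byte pairs into zero-extended even and odd 16-bit lanes with TRN1/TRN2 against a zero register, which is the role v6, v0, v4 and v5 play in the new loop, instead of filling two registers with a lane-indexed LD2.

// Illustrative sketch (not from the patch): deinterleave byte pairs with
// TRN1/TRN2 against zero instead of a lane-indexed LD2.
#include <arm_neon.h>
#include <stdint.h>

// pairs = {a0,b0, a1,b1, ..., a7,b7} gathered from the eight source offsets.
static inline void SplitEvenOdd(uint8x16_t pairs,
                                uint16x8_t* even,   // a0..a7, zero-extended
                                uint16x8_t* odd) {  // b0..b7, zero-extended
  const uint8x16_t zero = vdupq_n_u8(0);
  // TRN1 keeps the even-indexed bytes, TRN2 the odd-indexed bytes; pairing
  // each with a zero byte makes every 16-bit lane a zero-extended source
  // byte on a little-endian target.
  *even = vreinterpretq_u16_u8(vtrn1q_u8(pairs, zero));
  *odd = vreinterpretq_u16_u8(vtrn2q_u8(pairs, zero));
}

Gathering the pairs with one scalar load plus lane-indexed LD1 loads keeps the eight loads independent of each other, which is what removes the loop-carried dependency called out in the commit message.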
@@ -1103,15 +1103,12 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
   );
 }
 
-// TODO(Yang Zhang): Investigate less load instructions for
-// the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                      \
-  "lsr        %5, %3, #16                    \n" \
-  "add        %6, %1, %5                     \n" \
-  "add        %3, %3, %4                     \n" \
-  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
+#define SCALE_FILTER_COLS_STEP_ADDR                        \
+  "lsr        %[tmp_offset], %x[x], #16                \n" \
+  "add        %[tmp_ptr], %[src_ptr], %[tmp_offset]    \n" \
+  "add        %x[x], %x[x], %x[dx]                     \n"
 
-// The NEON version mimics this formula (from row_common.cc):
+// The Neon version mimics this formula (from scale_common.cc):
 // #define BLENDER(a, b, f) (uint8_t)((int)(a) +
 //        ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 
@@ -1121,65 +1118,69 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                           int x,
                           int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
-  int* tmp = dx_offset;
-  const uint8_t* src_tmp = src_ptr;
-  int64_t x64 = (int64_t)x;    // NOLINT
-  int64_t dx64 = (int64_t)dx;  // NOLINT
-  asm volatile (
-      "dup        v0.4s, %w3                     \n"  // x
-      "dup        v1.4s, %w4                     \n"  // dx
-      "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
+  int64_t tmp_offset;
+  uint8_t* tmp_ptr;
+  asm volatile(
+      "dup        v0.4s, %w[x]                   \n"
+      "dup        v1.4s, %w[dx]                  \n"
+      "ld1        {v2.4s}, [%[dx_offset]]        \n"  // 0 1 2 3
       "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
+      "shl        v22.4s, v1.4s, #3              \n"  // 8 * dx
+
       "mul        v1.4s, v1.4s, v2.4s            \n"
       // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
       "add        v1.4s, v1.4s, v0.4s            \n"
       // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
       "add        v2.4s, v1.4s, v3.4s            \n"
-      "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
-      "1:                                        \n"
-      LOAD2_DATA8_LANE(0)
-      LOAD2_DATA8_LANE(1)
-      LOAD2_DATA8_LANE(2)
-      LOAD2_DATA8_LANE(3)
-      LOAD2_DATA8_LANE(4)
-      LOAD2_DATA8_LANE(5)
-      LOAD2_DATA8_LANE(6)
-      LOAD2_DATA8_LANE(7)
-      "mov        v6.16b, v1.16b                 \n"
-      "mov        v7.16b, v2.16b                 \n"
-      "uzp1       v6.8h, v6.8h, v7.8h            \n"
-      "ushll      v4.8h, v4.8b, #0               \n"
-      "ushll      v5.8h, v5.8b, #0               \n"
+      "movi       v0.8h, #0                      \n"
+      // truncate to uint16_t
+      "trn1       v22.8h, v22.8h, v0.8h          \n"
+      "trn1       v20.8h, v1.8h, v0.8h           \n"
+      "trn1       v21.8h, v2.8h, v0.8h           \n"
+      "1:                                        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ldr        h6, [%[tmp_ptr]]               \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[1], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[2], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[3], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[4], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[5], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[6], [%[tmp_ptr]]        \n" SCALE_FILTER_COLS_STEP_ADDR
+      "ld1        {v6.h}[7], [%[tmp_ptr]]        \n"
+
+      "subs       %w[width], %w[width], #8       \n"  // 8 processed per loop
+      "trn1       v4.16b, v6.16b, v0.16b         \n"
+      "trn2       v5.16b, v6.16b, v0.16b         \n"
+
       "ssubl      v16.4s, v5.4h, v4.4h           \n"
       "ssubl2     v17.4s, v5.8h, v4.8h           \n"
-      "ushll      v7.4s, v6.4h, #0               \n"
-      "ushll2     v6.4s, v6.8h, #0               \n"
-      "mul        v16.4s, v16.4s, v7.4s          \n"
-      "mul        v17.4s, v17.4s, v6.4s          \n"
+      "mul        v16.4s, v16.4s, v20.4s         \n"
+      "mul        v17.4s, v17.4s, v21.4s         \n"
       "rshrn      v6.4h, v16.4s, #16             \n"
       "rshrn2     v6.8h, v17.4s, #16             \n"
       "add        v4.8h, v4.8h, v6.8h            \n"
       "xtn        v4.8b, v4.8h                   \n"
 
-      "st1        {v4.8b}, [%0], #8              \n"  // store pixels
-      "add        v1.4s, v1.4s, v0.4s            \n"
-      "add        v2.4s, v2.4s, v0.4s            \n"
-      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "add        v20.8h, v20.8h, v22.8h         \n"
+      "add        v21.8h, v21.8h, v22.8h         \n"
+      "st1        {v4.8b}, [%[dst_ptr]], #8      \n"  // store pixels
       "b.gt       1b                             \n"
-      : "+r"(dst_ptr),    // %0
-        "+r"(src_ptr),    // %1
-        "+r"(dst_width),  // %2
-        "+r"(x64),        // %3
-        "+r"(dx64),       // %4
-        "+r"(tmp),        // %5
-        "+r"(src_tmp)     // %6
-      :
-      : "memory", "cc", "v0", "v1", "v2", "v3",
-        "v4", "v5", "v6", "v7", "v16", "v17"
-  );
+      : [src_ptr] "+r"(src_ptr),         // %[src_ptr]
+        [dst_ptr] "+r"(dst_ptr),         // %[dst_ptr]
+        [width] "+r"(dst_width),         // %[width]
+        [x] "+r"(x),                     // %[x]
+        [dx] "+r"(dx),                   // %[dx]
+        [tmp_offset] "=&r"(tmp_offset),  // %[tmp_offset]
+        [tmp_ptr] "=&r"(tmp_ptr)         // %[tmp_ptr]
+      : [dx_offset] "r"(dx_offset)       // %[dx_offset]
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
        "v20", "v21", "v22");
 }
 
-#undef LOAD2_DATA8_LANE
+#undef SCALE_FILTER_COLS_STEP_ADDR
 
 void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
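For reference, the address stepping done by SCALE_FILTER_COLS_STEP_ADDR and the blend described by the BLENDER comment correspond to a scalar loop along these lines (a sketch under the usual 16.16 fixed-point convention; the function name below is illustrative, not a libyuv symbol):

#include <stdint.h>

// Scalar sketch of one output row: x and dx are 16.16 fixed-point, as in
// ScaleFilterCols_NEON.  Each iteration mirrors one lane of the Neon loop.
static void ScaleFilterColsScalarSketch(uint8_t* dst, const uint8_t* src,
                                        int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    const uint8_t* p = src + (x >> 16);  // SCALE_FILTER_COLS_STEP_ADDR address
    int a = p[0];
    int b = p[1];
    int f = x & 0xffff;  // fractional position, used as the blend weight
    // BLENDER formula quoted in the diff above.
    dst[i] = (uint8_t)(a + (((f * (b - a)) + 0x8000) >> 16));
    x += dx;  // "add %x[x], %x[x], %x[dx]"
  }
}

The Neon loop computes eight of these outputs per iteration, keeping the truncated weights for lanes 0-3 and 4-7 in v20 and v21 and advancing both by 8 * dx (v22) each pass.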
|