[AArch64] Optimize ScaleRowDown38_3_Box_NEON

Replace LD4 and TRN instructions with LD1s and TBL since LD4 is known to
be slow on some micro-architectures, and remove other unnecessary
permutes.
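
As a rough illustration of the new pattern (a minimal sketch in C intrinsics,
not the kernel itself; the GatherPairs helper and kPairIdx names are made up
here), two plain LD1 loads plus a TBL gather replace one structured LD4 load.
The indices mirror kScaleRowDown38_3_BoxIndices1, and entries of 255 are
out-of-range lanes that TBL fills with zero:

  #include <arm_neon.h>

  // Illustrative only: gather the byte pairs that the kernel sums from a
  // 32-byte block using LD1 + TBL rather than a structured LD4 load.
  static const uint8_t kPairIdx[16] = {0,  1,  6,  7,  12,  13,  16,  17,
                                       22, 23, 28, 29, 255, 255, 255, 255};

  static inline uint8x16_t GatherPairs(const uint8_t* src) {
    uint8x16x2_t block;
    block.val[0] = vld1q_u8(src);       // plain ld1 {v0.16b}
    block.val[1] = vld1q_u8(src + 16);  // plain ld1 {v1.16b}
    // tbl across both registers; index 255 lanes become zero
    return vqtbl2q_u8(block, vld1q_u8(kPairIdx));
  }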

Reduction in run times:

 Cortex-A55: -24.8%
Cortex-A510: -32.7%
Cortex-A520: -37.7%
 Cortex-A76: -51.8%
Cortex-A715: -58.9%
Cortex-A720: -58.9%
  Cortex-X1: -54.8%
  Cortex-X2: -50.3%
  Cortex-X3: -57.1%
  Cortex-X4: -49.8%
Cortex-X925: -52.0%

Co-authored-by: Cosmina Dunca <cosmina.dunca@arm.com>
Bug: b/42280945
Change-Id: Ie96bac30fffbe41f8d1501ee289795830ab127e5
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872803
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
George Steed 2024-09-17 13:41:08 +01:00 committed by Frank Barchard
parent 0bce5120f6
commit 775fd92e59

@@ -324,14 +324,11 @@ static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
22, 24, 27, 30, 0, 0, 0, 0};
static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
34, 6, 22, 35, 0, 0, 0, 0};
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18};
static const vec16 kMult38_Div664 = {
65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0};
static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12,
65536 / 18, 65536 / 18, 65536 / 12,
0, 0};
// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
@@ -367,135 +364,80 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
: "memory", "cc", "v0", "v1", "v2", "v3");
}
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
static const uvec8 kScaleRowDown38_3_BoxIndices1[] = {
0, 1, 6, 7, 12, 13, 16, 17, 22, 23, 28, 29, 255, 255, 255, 255};
static const uvec8 kScaleRowDown38_3_BoxIndices2[] = {
2, 3, 8, 9, 14, 15, 18, 19, 24, 25, 30, 31, 255, 255, 255, 255};
static const uvec8 kScaleRowDown38_3_BoxIndices3[] = {
4, 5, 10, 11, 255, 255, 20, 21, 26, 27, 255, 255, 255, 255, 255, 255};
static const uvec8 kScaleRowDown38_NarrowIndices[] = {
0, 2, 4, 6, 8, 10, 16, 18, 20, 22, 24, 26, 255, 255, 255, 255};
void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
ptrdiff_t tmp_src_stride = src_stride;
const uint8_t* src_ptr1 = src_ptr + src_stride;
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
asm volatile(
"ld1 {v29.8h}, [%5] \n"
"ld1 {v30.16b}, [%6] \n"
"ld1 {v31.8h}, [%7] \n"
"add %2, %2, %0 \n"
"ld1 {v27.16b}, [%[tblArray1]] \n"
"ld1 {v28.16b}, [%[tblArray2]] \n"
"ld1 {v29.16b}, [%[tblArray3]] \n"
"ld1 {v31.16b}, [%[tblArray4]] \n"
"ld1 {v30.16b}, [%[div996]] \n"
"1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n"
"ldp q21, q1, [%[src_ptr1]], #32 \n"
"ldp q22, q2, [%[src_ptr2]], #32 \n"
// 00 40 01 41 02 42 03 43
// 10 50 11 51 12 52 13 53
// 20 60 21 61 22 62 23 63
// 30 70 31 71 32 72 33 73
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
"subs %w4, %w4, #12 \n"
"subs %w[width], %w[width], #12 \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// 00 10 01 11 02 12 03 13
// 40 50 41 51 42 52 43 53
"trn1 v20.8b, v0.8b, v1.8b \n"
"trn2 v21.8b, v0.8b, v1.8b \n"
"trn1 v22.8b, v4.8b, v5.8b \n"
"trn2 v23.8b, v4.8b, v5.8b \n"
"trn1 v24.8b, v16.8b, v17.8b \n"
"trn2 v25.8b, v16.8b, v17.8b \n"
"uaddl v23.8h, v20.8b, v21.8b \n"
"uaddl v3.8h, v0.8b, v1.8b \n"
"uaddl2 v24.8h, v20.16b, v21.16b \n"
"uaddl2 v4.8h, v0.16b, v1.16b \n"
// 20 30 21 31 22 32 23 33
// 60 70 61 71 62 72 63 73
"trn1 v0.8b, v2.8b, v3.8b \n"
"trn2 v1.8b, v2.8b, v3.8b \n"
"trn1 v4.8b, v6.8b, v7.8b \n"
"trn2 v5.8b, v6.8b, v7.8b \n"
"trn1 v16.8b, v18.8b, v19.8b \n"
"trn2 v17.8b, v18.8b, v19.8b \n"
"uaddw v23.8h, v23.8h, v22.8b \n"
"uaddw v3.8h, v3.8h, v2.8b \n"
"uaddw2 v24.8h, v24.8h, v22.16b \n"
"uaddw2 v4.8h, v4.8h, v2.16b \n"
// 00+10 01+11 02+12 03+13
// 40+50 41+51 42+52 43+53
"uaddlp v20.4h, v20.8b \n"
"uaddlp v21.4h, v21.8b \n"
"uaddlp v22.4h, v22.8b \n"
"uaddlp v23.4h, v23.8b \n"
"uaddlp v24.4h, v24.8b \n"
"uaddlp v25.4h, v25.8b \n"
"tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n"
"tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n"
"tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n"
"tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n"
"tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n"
"tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n"
// 60+70 61+71 62+72 63+73
"uaddlp v1.4h, v1.8b \n"
"uaddlp v5.4h, v5.8b \n"
"uaddlp v17.4h, v17.8b \n"
// combine source lines
"add v20.4h, v20.4h, v22.4h \n"
"add v21.4h, v21.4h, v23.4h \n"
"add v20.4h, v20.4h, v24.4h \n"
"add v21.4h, v21.4h, v25.4h \n"
"add v2.4h, v1.4h, v5.4h \n"
"add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"sqrdmulh v2.8h, v2.8h, v29.8h \n"
"xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"ushll v16.8h, v16.8b, #0 \n"
"uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
"add v0.8h, v0.8h, v16.8h \n"
// xx 20 xx 21 xx 22 xx 23
// xx 30 xx 31 xx 32 xx 33
"trn1 v1.8h, v0.8h, v0.8h \n"
"trn2 v4.8h, v0.8h, v0.8h \n"
"xtn v0.4h, v1.4s \n"
"xtn v4.4h, v4.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
// 0+1+2, 3+4+5
"add v20.8h, v20.8h, v0.8h \n"
"add v21.8h, v21.8h, v4.8h \n"
"prfm pldl1keep, [%2, 448] \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"sqrdmulh v0.8h, v20.8h, v31.8h \n"
"sqrdmulh v1.8h, v21.8h, v31.8h \n"
"prfm pldl1keep, [%3, 448] \n"
// Align for table lookup; vtbl requires registers to be adjacent
"tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
"add v23.8h, v20.8h, v21.8h \n"
"add v3.8h, v0.8h, v1.8h \n"
"add v24.8h, v23.8h, v22.8h \n"
"add v4.8h, v3.8h, v2.8h \n"
"sqrdmulh v24.8h, v24.8h, v30.8h \n"
"sqrdmulh v25.8h, v4.8h, v30.8h \n"
"tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n"
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
"+r"(src_ptr1), // %3
"+r"(dst_width) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
"v30", "v31");
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
[src_ptr2] "+r"(src_ptr2), // %[src_ptr2]
[width] "+r"(dst_width) // %[width]
: [div996] "r"(&kMult38_Div996), // %[div996]
[tblArray1] "r"(kScaleRowDown38_3_BoxIndices1), // %[tblArray1]
[tblArray2] "r"(kScaleRowDown38_3_BoxIndices2), // %[tblArray2]
[tblArray3] "r"(kScaleRowDown38_3_BoxIndices3), // %[tblArray3]
[tblArray4] "r"(kScaleRowDown38_NarrowIndices) // %[tblArray4]
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v20", "v21", "22", "23",
"24", "v27", "v28", "v29", "v30", "v31");
}
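
The row sums have to be divided by 6 or 9, which are not powers of two, so the
kernel multiplies by a reciprocal and keeps the high 16 bits with SQRDMULH.
Because SQRDMULH doubles the product, the tables store 65536 / (2 * n);
kMult38_Div996 holds 65536 / 18 in the divide-by-9 lanes and 65536 / 12 in the
divide-by-6 lanes. A scalar sketch of one lane (helper name is illustrative,
saturation omitted):

  #include <stdint.h>

  // Scalar model of "sqrdmulh vD.8h, vSum.8h, vRecip.8h" for one lane:
  // result = (2 * a * b + 2^15) >> 16, which approximates sum / 9 when
  // b = 65536 / 18.
  static inline int16_t DivApprox9(int16_t sum) {
    const int16_t kRecip = 65536 / 18;  // 3640
    return (int16_t)((2 * sum * kRecip + (1 << 15)) >> 16);
  }
  // e.g. DivApprox9(90) == 10, DivApprox9(9 * 255) == 255
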
static const uvec8 kScaleRowDown38_2_BoxIndices1[] = {
0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 255, 255, 255, 255};
static const uvec8 kScaleRowDown38_2_BoxIndices2[] = {
2, 18, 5, 21, 255, 255, 10, 26, 13, 29, 255, 255, 255, 255, 255, 255};
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,