mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
Gaussian reorder for benefit of A73
Roughly: instead of 4 loads and 8 multiplies, use 1 load and 2 multiplies 4 times over. The original code, as with the C code from clang and gcc, did all the loads, then all the math, then the store. The new code does a load, then the math, then the next load, etc. This schedules better on current arm 64 cpus. Number of registers also reduced, reusing the same registers. HiSilicon ARM A73: Now TestGaussRow_Opt (890 ms) TestGaussCol_Opt (571 ms) Was TestGaussRow_Opt (1061 ms) TestGaussCol_Opt (595 ms) Qualcomm 821 (Pixel): Now TestGaussRow_Opt (571 ms) TestGaussCol_Opt (474 ms) Was TestGaussRow_Opt (751 ms) TestGaussCol_Opt (520 ms) TBR=kjellander@chromium.org BUG=libyuv:719 TEST=LibYUVPlanarTest.TestGaussRow_Opt Reviewed-on: https://chromium-review.googlesource.com/627478 Reviewed-by: Cheng Wang <wangcheng@google.com> Reviewed-by: Frank Barchard <fbarchard@google.com> Change-Id: I5ec81191d460801f0d4a89f0384f89925ff036de Reviewed-on: https://chromium-review.googlesource.com/634448 Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
ad2409443c
commit
f0a9d6d206
@ -2706,22 +2706,19 @@ void GaussCol_NEON(const uint16* src0,
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
|
||||
"ld1 {v2.8h}, [%4], #16 \n"
|
||||
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
|
||||
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
|
||||
"ld1 {v2.8h}, [%1], #16 \n"
|
||||
"ld1 {v3.8h}, [%2], #16 \n"
|
||||
"ld1 {v4.8h}, [%3], #16 \n"
|
||||
"ld1 {v5.8h}, [%4], #16 \n"
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"ld1 {v2.8h}, [%2], #16 \n"
|
||||
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
|
||||
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
|
||||
"ld1 {v2.8h}, [%3], #16 \n"
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
|
||||
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1
|
||||
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
|
||||
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"umlal v0.4s, v3.4h, v7.4h \n" // * 6
|
||||
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
|
||||
"umlal v0.4s, v4.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
|
||||
|
||||
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
|
||||
"b.gt 1b \n"
|
||||
|
||||
@ -2733,93 +2730,7 @@ void GaussCol_NEON(const uint16* src0,
|
||||
"+r"(dst), // %5
|
||||
"+r"(width) // %6
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||
}
|
||||
|
||||
#if 0
|
||||
a8: ad7f8d82 ldp q2, q3, [x12,#-16]
|
||||
ac: 3cdf8186 ldur q6, [x12,#-8]
|
||||
b0: 3cdf4184 ldur q4, [x12,#-12]
|
||||
b4: 3cc04185 ldur q5, [x12,#4]
|
||||
b8: 3cc08187 ldur q7, [x12,#8]
|
||||
bc: 3cdfc190 ldur q16, [x12,#-4]
|
||||
c0: 3cc0c191 ldur q17, [x12,#12]
|
||||
c4: 3dc00592 ldr q18, [x12,#16]
|
||||
c8: 4ea094c2 mla v2.4s, v6.4s, v0.4s #6
|
||||
cc: 4ea48604 add v4.4s, v16.4s, v4.4s
|
||||
d0: 4ea58625 add v5.4s, v17.4s, v5.4s
|
||||
d4: 4ea38442 add v2.4s, v2.4s, v3.4s
|
||||
d8: 4ea094e3 mla v3.4s, v7.4s, v0.4s #6
|
||||
dc: 4f225484 shl v4.4s, v4.4s, #2
|
||||
e0: 4f2254a5 shl v5.4s, v5.4s, #2
|
||||
e4: 4eb28463 add v3.4s, v3.4s, v18.4s
|
||||
e8: 4ea48442 add v2.4s, v2.4s, v4.4s
|
||||
ec: 4ea58463 add v3.4s, v3.4s, v5.4s
|
||||
f0: 4ea18442 add v2.4s, v2.4s, v1.4s #128
|
||||
f4: 4ea18463 add v3.4s, v3.4s, v1.4s #128
|
||||
f8: 0f188442 shrn v2.4h, v2.4s, #8
|
||||
fc: 0f188463 shrn v3.4h, v3.4s, #8
|
||||
100: f10021ad subs x13, x13, #0x8
|
||||
104: 6d3f8d62 stp d2, d3, [x11,#-8]
|
||||
108: 9100416b add x11, x11, #0x10
|
||||
10c: 9100818c add x12, x12, #0x20
|
||||
110: 54fffcc1 b.ne a8 <GaussRow_C+0xa8>
|
||||
#endif
|
||||
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
// Filter one row with 1, 4, 6, 4, 1 Gaussian coefficients:
//   dst[i] = (src[i] + 4*src[i+1] + 6*src[i+2] + 4*src[i+3] + src[i+4] + 128) >> 8
// Uses overlapping unaligned loads (ldur at byte offsets) instead of separate
// row pointers. Processes 8 output samples per loop iteration; width must be a
// multiple of 8. Reads up to src[width+11] — caller must over-allocate
// (matches the compiler-generated code in the #if 0 disassembly above).
void GaussRow_NEON3(const uint32* src, uint16* dst, int width) {
  asm volatile(
      "movi       v0.4s, #6                      \n"  // constant 6
      // FIX: v1 is the +128 rounding bias used before the >>8 narrowing
      // below; it was never initialized (the disassembly above shows this
      // register holding #128).
      "movi       v1.4s, #128                    \n"  // rounding constant
      "add        %0, %0, #0x10                  \n"  // bias src for -16..16 offsets
      "add        %1, %1, #0x8                   \n"  // bias dst for -8 offset store

      "1:                                        \n"
      "ldp        q2, q3, [%0,#-16]              \n"  // src[0..7]  (* 1 terms)
      "ldur       q6, [%0,#-8]                   \n"  // src[2..5]  (* 6, low)
      "ldur       q4, [%0,#-12]                  \n"  // src[1..4]  (* 4, low)
      "ldur       q5, [%0,#4]                    \n"  // src[5..8]  (* 4, high)
      "ldur       q7, [%0,#8]                    \n"  // src[6..9]  (* 6, high)
      "ldur       q16, [%0,#-4]                  \n"  // src[3..6]  (* 4, low)
      "ldur       q17, [%0,#12]                  \n"  // src[7..10] (* 4, high)
      "ldr        q18, [%0,#16]                  \n"  // src[8..11] (* 1, high)
      "mla        v2.4s, v6.4s, v0.4s            \n"  // + center * 6
      "add        v4.4s, v16.4s, v4.4s           \n"  // sum the two * 4 terms
      "add        v5.4s, v17.4s, v5.4s           \n"
      "add        v2.4s, v2.4s, v3.4s            \n"  // + src[i+4] * 1
      "mla        v3.4s, v7.4s, v0.4s            \n"  // high half: center * 6
      "shl        v4.4s, v4.4s, #2               \n"  // (a+b) * 4
      "shl        v5.4s, v5.4s, #2               \n"
      "add        v3.4s, v3.4s, v18.4s           \n"  // + src[i+4] * 1
      "add        v2.4s, v2.4s, v4.4s           \n"
      "add        v3.4s, v3.4s, v5.4s           \n"
      "add        v2.4s, v2.4s, v1.4s           \n"   // + 128 (round)
      "add        v3.4s, v3.4s, v1.4s           \n"   // + 128 (round)
      "shrn       v2.4h, v2.4s, #8              \n"   // >> 8, narrow to u16
      "shrn       v3.4h, v3.4s, #8              \n"
      "subs       %w2, %w2, #0x8                \n"   // 8 samples per loop
      "stp        d2, d3, [%1,#-8]              \n"   // store 8 samples
      "add        %1, %1, #0x10                 \n"
      "add        %0, %0, #0x20                 \n"
      "b.gt       1b                            \n"

      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v16", "v17", "v18");
}
|
||||
|
||||
|
||||
// C reference for the Gaussian row filter with 1, 4, 6, 4, 1 coefficients:
//   dst[i] = (src[i] + 4*src[i+1] + 6*src[i+2] + 4*src[i+3] + src[i+4] + 128) >> 8
// Reads width + 4 source samples; result is truncated to uint16 by the store.
// FIX: removed a stray inline-asm clobber line (": \"cc\", \"memory\", ...")
// that had no enclosing asm statement and made the function fail to compile.
void GaussRow_NEON2(const uint32* src, uint16* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    *dst++ =
        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
    ++src;
  }
}
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
@ -2832,19 +2743,19 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
|
||||
"movi v7.4s, #6 \n" // constant 6
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v1.4s,v2.4s,v3.4s}, [%0], %6 \n" // load 12 source samples
|
||||
"ld1 {v4.4s,v5.4s}, [%1], #32 \n"
|
||||
"ld1 {v16.4s,v17.4s}, [%2], #32 \n"
|
||||
"ld1 {v18.4s,v19.4s}, [%3], #32 \n"
|
||||
"subs %w5, %w5, #8 \n" // 8 processed per loop
|
||||
"add v0.4s, v1.4s, v2.4s \n" // * 1
|
||||
"add v1.4s, v2.4s, v3.4s \n" // * 1
|
||||
"add v2.4s, v4.4s, v18.4s \n" // add rows for * 4
|
||||
"add v3.4s, v5.4s, v19.4s \n"
|
||||
"ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
|
||||
"add v0.4s, v0.4s, v1.4s \n" // * 1
|
||||
"add v1.4s, v1.4s, v2.4s \n" // * 1
|
||||
"ld1 {v2.4s,v3.4s}, [%2], #32 \n"
|
||||
"mla v0.4s, v2.4s, v7.4s \n" // * 6
|
||||
"mla v1.4s, v3.4s, v7.4s \n" // * 6
|
||||
"ld1 {v2.4s,v3.4s}, [%1], #32 \n"
|
||||
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
|
||||
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
|
||||
"add v3.4s, v3.4s, v5.4s \n"
|
||||
"mla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"mla v1.4s, v3.4s, v6.4s \n" // * 4
|
||||
"mla v0.4s, v16.4s, v7.4s \n" // * 6
|
||||
"mla v1.4s, v17.4s, v7.4s \n" // * 6
|
||||
"subs %w5, %w5, #8 \n" // 8 processed per loop
|
||||
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
|
||||
"uqrshrn2 v0.8h, v1.4s, #8 \n"
|
||||
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
|
||||
@ -2856,44 +2767,8 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(width) // %5
|
||||
: "r"(32LL) // %6
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v18", "v19" );
|
||||
}
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
// Filter one row with 1, 4, 6, 4, 1 Gaussian coefficients using four
// overlapping pointers into the same row. Processes 4 output samples per
// loop iteration (width must be a multiple of 4). The 2-register load
// fetches 8 samples but %0 only advances 16 bytes (4 samples), so the last
// iteration reads 4 samples past src[width+3] — caller must over-allocate.
// FIX: the constraint/clobber section contained two merged variants from a
// diff (": \"r\"(16LL)" and ": \"r\"(32LL)" with duplicated clobber lists,
// and "v0","v1" listed twice). Kept the 16-byte advance, which matches the
// 4-per-loop subs and the 8-byte store.
void GaussRow_NEON4(const uint32* src, uint16* dst, int width) {
  const uint32* src1 = src + 1;
  const uint32* src2 = src + 2;
  const uint32* src3 = src + 3;
  asm volatile(
      "movi       v6.4s, #4                      \n"  // constant 4
      "movi       v7.4s, #6                      \n"  // constant 6

      "1:                                        \n"
      "ld1        {v0.4s,v1.4s}, [%0], %6        \n"  // src[i..i+7]; v1 = *1 term
      "ld1        {v2.4s}, [%1], #16             \n"  // src[i+1..i+4]
      "ld1        {v3.4s}, [%2], #16             \n"  // src[i+2..i+5]
      "ld1        {v4.4s}, [%3], #16             \n"  // src[i+3..i+6]
      "subs       %w5, %w5, #4                   \n"  // 4 processed per loop

      "mla        v0.4s, v2.4s, v6.4s            \n"  // * 4
      "mla        v0.4s, v3.4s, v7.4s            \n"  // * 6
      "mla        v0.4s, v4.4s, v6.4s            \n"  // * 4
      "add        v0.4s, v0.4s, v1.4s            \n"  // * 1 (src[i+4])
      "uqrshrn    v0.4h, v0.4s, #8               \n"  // round and pack

      "st1        {v0.4h}, [%4], #8              \n"  // store 4 samples
      "b.gt       1b                             \n"

      : "+r"(src),   // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(dst),   // %4
        "+r"(width)  // %5
      : "r"(16LL)    // %6
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v7");
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
@ -2759,8 +2759,9 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
|
||||
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
|
||||
}
|
||||
|
||||
EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1);
|
||||
EXPECT_EQ(dst_pixels_c[1279], 20496);
|
||||
EXPECT_EQ(dst_pixels_c[0],
|
||||
static_cast<uint16>(0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1));
|
||||
EXPECT_EQ(dst_pixels_c[1279], static_cast<uint16>(20496));
|
||||
}
|
||||
|
||||
extern "C" void GaussCol_NEON(const uint16* src0,
|
||||
@ -2819,8 +2820,9 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
||||
}
|
||||
|
||||
EXPECT_EQ(dst_pixels_c[0],
|
||||
0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + 1280 * 4 * 1);
|
||||
EXPECT_EQ(dst_pixels_c[1279], 61424);
|
||||
static_cast<uint32>(0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 +
|
||||
1280 * 4 * 1));
|
||||
EXPECT_EQ(dst_pixels_c[1279], static_cast<uint32>(61424));
|
||||
}
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user