mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
GaussRow_NEON from int to short
[ RUN ] LibYUVPlanarTest.TestGaussRow_Opt
[ OK ] LibYUVPlanarTest.TestGaussRow_Opt (601 ms)
[ RUN ] LibYUVPlanarTest.TestGaussCol_Opt
[ OK ] LibYUVPlanarTest.TestGaussCol_Opt (522 ms)

TBR=kjellander@chromium.org
BUG=libyuv:719
TEST=LibYUVPlanarTest.TestGaussRow_Opt

Change-Id: I1242b98672538e889f3ab48f215d6dabc7144ea7
Reviewed-on: https://chromium-review.googlesource.com/627478
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
1cc539f7d6
commit
ad2409443c
@ -2672,6 +2672,15 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
void GaussRow_C(const uint32* src, uint16* dst, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
*dst++ =
|
||||
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
|
||||
++src;
|
||||
}
|
||||
}
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
void GaussCol_C(const uint16* src0,
|
||||
const uint16* src1,
|
||||
|
||||
@ -2692,13 +2692,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
|
||||
: "cc", "memory", "v1", "v2");
|
||||
}
|
||||
|
||||
// Lane weights for a horizontal 1, 4, 6, 4, 1 filter. Row k holds the kernel
// shifted right by k lanes and zero-padded to 8 lanes, so multiplying the same
// 8 input lanes by row k and summing the products yields output k.
// The table is only ever read, so it is declared const.
// NOTE(review): "Gause" looks like a misspelling of "Gauss"; the name is kept
// because it is referenced elsewhere in this file.
static const vec16 kGauseCoefficients[4] = {
    {1, 4, 6, 4, 1, 0, 0, 0},
    {0, 1, 4, 6, 4, 1, 0, 0},
    {0, 0, 1, 4, 6, 4, 1, 0},
    {0, 0, 0, 1, 4, 6, 4, 1},
};
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
void GaussCol_NEON(const uint16* src0,
|
||||
const uint16* src1,
|
||||
@ -2719,15 +2712,15 @@ void GaussCol_NEON(const uint16* src0,
|
||||
"ld1 {v5.8h}, [%4], #16 \n"
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
|
||||
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1
|
||||
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
|
||||
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1
|
||||
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
|
||||
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"umlal v0.4s, v3.4h, v7.4h \n" // * 6
|
||||
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
|
||||
"umlal v0.4s, v4.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"umlal v0.4s, v3.4h, v7.4h \n" // * 6
|
||||
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
|
||||
"umlal v0.4s, v4.4h, v6.4h \n" // * 4
|
||||
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
|
||||
|
||||
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
|
||||
"b.gt 1b \n"
|
||||
@ -2743,41 +2736,164 @@ void GaussCol_NEON(const uint16* src0,
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||
}
|
||||
|
||||
#if 0
|
||||
a8: ad7f8d82 ldp q2, q3, [x12,#-16]
|
||||
ac: 3cdf8186 ldur q6, [x12,#-8]
|
||||
b0: 3cdf4184 ldur q4, [x12,#-12]
|
||||
b4: 3cc04185 ldur q5, [x12,#4]
|
||||
b8: 3cc08187 ldur q7, [x12,#8]
|
||||
bc: 3cdfc190 ldur q16, [x12,#-4]
|
||||
c0: 3cc0c191 ldur q17, [x12,#12]
|
||||
c4: 3dc00592 ldr q18, [x12,#16]
|
||||
c8: 4ea094c2 mla v2.4s, v6.4s, v0.4s #6
|
||||
cc: 4ea48604 add v4.4s, v16.4s, v4.4s
|
||||
d0: 4ea58625 add v5.4s, v17.4s, v5.4s
|
||||
d4: 4ea38442 add v2.4s, v2.4s, v3.4s
|
||||
d8: 4ea094e3 mla v3.4s, v7.4s, v0.4s #6
|
||||
dc: 4f225484 shl v4.4s, v4.4s, #2
|
||||
e0: 4f2254a5 shl v5.4s, v5.4s, #2
|
||||
e4: 4eb28463 add v3.4s, v3.4s, v18.4s
|
||||
e8: 4ea48442 add v2.4s, v2.4s, v4.4s
|
||||
ec: 4ea58463 add v3.4s, v3.4s, v5.4s
|
||||
f0: 4ea18442 add v2.4s, v2.4s, v1.4s #128
|
||||
f4: 4ea18463 add v3.4s, v3.4s, v1.4s #128
|
||||
f8: 0f188442 shrn v2.4h, v2.4s, #8
|
||||
fc: 0f188463 shrn v3.4h, v3.4s, #8
|
||||
100: f10021ad subs x13, x13, #0x8
|
||||
104: 6d3f8d62 stp d2, d3, [x11,#-8]
|
||||
108: 9100416b add x11, x11, #0x10
|
||||
10c: 9100818c add x12, x12, #0x20
|
||||
110: 54fffcc1 b.ne a8 <GaussRow_C+0xa8>
|
||||
#endif
|
||||
|
||||
|
||||
// Filter 5 adjacent samples of a row with 1, 4, 6, 4, 1 coefficients to produce 1 output sample.
|
||||
void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
|
||||
void GaussRow_NEON3(const uint32* src, uint16* dst, int width) {
|
||||
asm volatile(
|
||||
"ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [%3] \n"
|
||||
"movi v0.4s, #6 \n" // constant 6
|
||||
"add %0, %0, #0x10 \n"
|
||||
"add %1, %1, #0x8 \n"
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.8h}, [%0], %4 \n" // load 8 source samples
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
|
||||
"umull v1.4s, v0.4h, v20.4h \n" // first pixel
|
||||
"umlal2 v1.4s, v0.8h, v20.8h \n"
|
||||
"addv s1, v1.4s \n"
|
||||
|
||||
"umull v2.4s, v0.4h, v21.4h \n" // second pixel
|
||||
"umlal2 v2.4s, v0.8h, v21.8h \n"
|
||||
"addv s2, v2.4s \n"
|
||||
|
||||
"umull v3.4s, v0.4h, v22.4h \n" // third pixel
|
||||
"umlal2 v3.4s, v0.8h, v22.8h \n"
|
||||
"addv s3, v3.4s \n"
|
||||
|
||||
"umull v4.4s, v0.4h, v23.4h \n" // forth pixel
|
||||
"umlal2 v4.4s, v0.8h, v23.8h \n"
|
||||
"addv s4, v4.4s \n"
|
||||
|
||||
"st4 {v1.s,v2.s,v3.s,v4.s}[0], [%1], #16 \n" // store 4 samples
|
||||
"ldp q2, q3, [%0,#-16] \n"
|
||||
"ldur q6, [%0,#-8] \n"
|
||||
"ldur q4, [%0,#-12] \n"
|
||||
"ldur q5, [%0,#4] \n"
|
||||
"ldur q7, [%0,#8] \n"
|
||||
"ldur q16, [%0,#-4] \n"
|
||||
"ldur q17, [%0,#12] \n"
|
||||
"ldr q18, [%0,#16] \n"
|
||||
"mla v2.4s, v6.4s, v0.4s \n"
|
||||
"add v4.4s, v16.4s, v4.4s \n"
|
||||
"add v5.4s, v17.4s, v5.4s \n"
|
||||
"add v2.4s, v2.4s, v3.4s \n"
|
||||
"mla v3.4s, v7.4s, v0.4s \n"
|
||||
"shl v4.4s, v4.4s, #2 \n"
|
||||
"shl v5.4s, v5.4s, #2 \n"
|
||||
"add v3.4s, v3.4s, v18.4s \n"
|
||||
"add v2.4s, v2.4s, v4.4s \n"
|
||||
"add v3.4s, v3.4s, v5.4s \n"
|
||||
"add v2.4s, v2.4s, v1.4s \n"
|
||||
"add v3.4s, v3.4s, v1.4s \n"
|
||||
"shrn v2.4h, v2.4s, #8 \n"
|
||||
"shrn v3.4h, v3.4s, #8 \n"
|
||||
"subs %w2, %w2, #0x8 \n"
|
||||
"stp d2, d3, [%1,#-8] \n"
|
||||
"add %1, %1, #0x10 \n"
|
||||
"add %0, %0, #0x20 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
: "+r"(src0), // %0
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(&kGauseCoefficients[0]), // %3
|
||||
"r"(8LL) // %4
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v20", "v21", "v22",
|
||||
"v23");
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v18" );
|
||||
}
|
||||
|
||||
|
||||
void GaussRow_NEON2(const uint32* src, uint16* dst, int width) {
|
||||
int i;
|
||||
for (i = 0; i < width; ++i) {
|
||||
*dst++ =
|
||||
(src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
|
||||
++src;
|
||||
}
|
||||
}
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
|
||||
const uint32* src1 = src + 1;
|
||||
const uint32* src2 = src + 2;
|
||||
const uint32* src3 = src + 3;
|
||||
asm volatile(
|
||||
"movi v6.4s, #4 \n" // constant 4
|
||||
"movi v7.4s, #6 \n" // constant 6
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v1.4s,v2.4s,v3.4s}, [%0], %6 \n" // load 12 source samples
|
||||
"ld1 {v4.4s,v5.4s}, [%1], #32 \n"
|
||||
"ld1 {v16.4s,v17.4s}, [%2], #32 \n"
|
||||
"ld1 {v18.4s,v19.4s}, [%3], #32 \n"
|
||||
"subs %w5, %w5, #8 \n" // 8 processed per loop
|
||||
"add v0.4s, v1.4s, v2.4s \n" // * 1
|
||||
"add v1.4s, v2.4s, v3.4s \n" // * 1
|
||||
"add v2.4s, v4.4s, v18.4s \n" // add rows for * 4
|
||||
"add v3.4s, v5.4s, v19.4s \n"
|
||||
"mla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"mla v1.4s, v3.4s, v6.4s \n" // * 4
|
||||
"mla v0.4s, v16.4s, v7.4s \n" // * 6
|
||||
"mla v1.4s, v17.4s, v7.4s \n" // * 6
|
||||
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
|
||||
"uqrshrn2 v0.8h, v1.4s, #8 \n"
|
||||
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
|
||||
"b.gt 1b \n"
|
||||
|
||||
: "+r"(src), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(width) // %5
|
||||
: "r"(32LL) // %6
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v18", "v19" );
|
||||
}
|
||||
|
||||
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
|
||||
void GaussRow_NEON4(const uint32* src, uint16* dst, int width) {
|
||||
const uint32* src1 = src + 1;
|
||||
const uint32* src2 = src + 2;
|
||||
const uint32* src3 = src + 3;
|
||||
asm volatile(
|
||||
"movi v6.4s, #4 \n" // constant 4
|
||||
"movi v7.4s, #6 \n" // constant 6
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.4s,v1.4s}, [%0], %6 \n" // load 8 source samples
|
||||
"ld1 {v2.4s}, [%1], #16 \n"
|
||||
"ld1 {v3.4s}, [%2], #16 \n"
|
||||
"ld1 {v4.4s}, [%3], #16 \n"
|
||||
"subs %w5, %w5, #4 \n" // 4 processed per loop
|
||||
|
||||
"mla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"mla v0.4s, v3.4s, v7.4s \n" // * 6
|
||||
"mla v0.4s, v4.4s, v6.4s \n" // * 4
|
||||
"add v0.4s, v0.4s, v1.4s \n" // * 1
|
||||
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
|
||||
|
||||
"st1 {v0.4h}, [%4], #8 \n" // store 8 samples
|
||||
"b.gt 1b \n"
|
||||
|
||||
: "+r"(src), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(width) // %5
|
||||
: "r"(16LL) // %6
|
||||
: "cc", "memory", "v0", "v1", "v0", "v1", "v2", "v3", "v4", "v6", "v7" );
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
@ -2725,23 +2725,22 @@ TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
extern "C" void GaussRow_NEON(const uint16* src0, uint32* dst, int width);
|
||||
extern "C" void GaussRow_NEON(const uint32* src, uint16* dst, int width);
|
||||
extern "C" void GaussRow_C(const uint32* src, uint16* dst, int width);
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
|
||||
SIMD_ALIGNED(uint16 orig_pixels[1280 + 4]);
|
||||
SIMD_ALIGNED(uint32 dst_pixels_c[1280]);
|
||||
SIMD_ALIGNED(uint32 dst_pixels_opt[1280]);
|
||||
SIMD_ALIGNED(uint32 orig_pixels[1280 + 4]);
|
||||
SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
|
||||
SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
|
||||
|
||||
memset(orig_pixels, 0, sizeof(orig_pixels));
|
||||
memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
|
||||
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
|
||||
|
||||
for (int i = 0; i < 1280 + 4; ++i) {
|
||||
orig_pixels[i] = i;
|
||||
orig_pixels[i] = i * 256;
|
||||
}
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280);
|
||||
GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
|
||||
MaskCpuFlags(benchmark_cpu_info_);
|
||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
@ -2749,10 +2748,10 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
|
||||
if (has_neon) {
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
} else {
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
}
|
||||
#else
|
||||
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -2824,6 +2823,4 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
||||
EXPECT_EQ(dst_pixels_c[1279], 61424);
|
||||
}
|
||||
|
||||
#endif // aarch64
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user