GaussRow_NEON from int to short

[ RUN      ] LibYUVPlanarTest.TestGaussRow_Opt
 [       OK ] LibYUVPlanarTest.TestGaussRow_Opt (601 ms)
 [ RUN      ] LibYUVPlanarTest.TestGaussCol_Opt
 [       OK ] LibYUVPlanarTest.TestGaussCol_Opt (522 ms)

TBR=kjellander@chromium.org
BUG=libyuv:719
TEST=LibYUVPlanarTest.TestGaussRow_Opt

Change-Id: I1242b98672538e889f3ab48f215d6dabc7144ea7
Reviewed-on: https://chromium-review.googlesource.com/627478
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
This commit is contained in:
Frank Barchard 2017-08-23 16:01:55 -07:00
parent 1cc539f7d6
commit ad2409443c
3 changed files with 175 additions and 53 deletions

View File

@ -2672,6 +2672,15 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
} }
} }
// Reference C version of the horizontal Gaussian filter. Every output sample
// is the 1, 4, 6, 4, 1 weighted sum of 5 consecutive source samples, plus 128
// for rounding, shifted right by 8 and stored as a 16-bit value.
// Reads src[0] .. src[width + 3] and writes dst[0] .. dst[width - 1].
void GaussRow_C(const uint32* src, uint16* dst, int width) {
  int x = 0;
  while (x < width) {
    // The kernel is symmetric, so taps with equal weight are summed first.
    dst[x] = (src[x] + src[x + 4] + (src[x + 1] + src[x + 3]) * 4 +
              src[x + 2] * 6 + 128) >>
             8;
    ++x;
  }
}
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_C(const uint16* src0, void GaussCol_C(const uint16* src0,
const uint16* src1, const uint16* src1,

View File

@ -2692,13 +2692,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
: "cc", "memory", "v1", "v2"); : "cc", "memory", "v1", "v2");
} }
// Table of 1, 4, 6, 4, 1 filter weights, one row per output sample: row k
// holds the five weights starting at lane k with zeros elsewhere, so rows
// 0..3 are the same kernel shifted right by 0..3 lanes.
// NOTE(review): "Gause" is presumably a misspelling of "Gauss"; the name is
// left unchanged because the asm operand list refers to it.
static vec16 kGauseCoefficients[4] = {
{1, 4, 6, 4, 1, 0, 0, 0},
{0, 1, 4, 6, 4, 1, 0, 0},
{0, 0, 1, 4, 6, 4, 1, 0},
{0, 0, 0, 1, 4, 6, 4, 1},
};
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
void GaussCol_NEON(const uint16* src0, void GaussCol_NEON(const uint16* src0,
const uint16* src1, const uint16* src1,
@ -2719,15 +2712,15 @@ void GaussCol_NEON(const uint16* src0,
"ld1 {v5.8h}, [%4], #16 \n" "ld1 {v5.8h}, [%4], #16 \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop "subs %w6, %w6, #8 \n" // 8 processed per loop
"uaddl v0.4s, v1.4h, v5.4h \n" // * 1 "uaddl v0.4s, v1.4h, v5.4h \n" // * 1
"uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1 "uaddl2 v1.4s, v1.8h, v5.8h \n" // * 1
"umlal v0.4s, v2.4h, v6.4h \n" // * 4 "umlal v0.4s, v2.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"umlal v0.4s, v3.4h, v7.4h \n" // * 6 "umlal v0.4s, v3.4h, v7.4h \n" // * 6
"umlal2 v1.4s, v3.8h, v7.8h \n" // * 6 "umlal2 v1.4s, v3.8h, v7.8h \n" // * 6
"umlal v0.4s, v4.4h, v6.4h \n" // * 4 "umlal v0.4s, v4.4h, v6.4h \n" // * 4
"umlal2 v1.4s, v4.8h, v6.8h \n" // * 4 "umlal2 v1.4s, v4.8h, v6.8h \n" // * 4
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n" "b.gt 1b \n"
@ -2743,41 +2736,164 @@ void GaussCol_NEON(const uint16* src0,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
} }
#if 0
a8: ad7f8d82 ldp q2, q3, [x12,#-16]
ac: 3cdf8186 ldur q6, [x12,#-8]
b0: 3cdf4184 ldur q4, [x12,#-12]
b4: 3cc04185 ldur q5, [x12,#4]
b8: 3cc08187 ldur q7, [x12,#8]
bc: 3cdfc190 ldur q16, [x12,#-4]
c0: 3cc0c191 ldur q17, [x12,#12]
c4: 3dc00592 ldr q18, [x12,#16]
c8: 4ea094c2 mla v2.4s, v6.4s, v0.4s #6
cc: 4ea48604 add v4.4s, v16.4s, v4.4s
d0: 4ea58625 add v5.4s, v17.4s, v5.4s
d4: 4ea38442 add v2.4s, v2.4s, v3.4s
d8: 4ea094e3 mla v3.4s, v7.4s, v0.4s #6
dc: 4f225484 shl v4.4s, v4.4s, #2
e0: 4f2254a5 shl v5.4s, v5.4s, #2
e4: 4eb28463 add v3.4s, v3.4s, v18.4s
e8: 4ea48442 add v2.4s, v2.4s, v4.4s
ec: 4ea58463 add v3.4s, v3.4s, v5.4s
f0: 4ea18442 add v2.4s, v2.4s, v1.4s #128
f4: 4ea18463 add v3.4s, v3.4s, v1.4s #128
f8: 0f188442 shrn v2.4h, v2.4s, #8
fc: 0f188463 shrn v3.4h, v3.4s, #8
100: f10021ad subs x13, x13, #0x8
104: 6d3f8d62 stp d2, d3, [x11,#-8]
108: 9100416b add x11, x11, #0x10
10c: 9100818c add x12, x12, #0x20
110: 54fffcc1 b.ne a8 <GaussRow_C+0xa8>
#endif
// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// NOTE(review): this span is a side-by-side diff rendering; each line shows the
// removed GaussRow_NEON (left) followed by the added GaussRow_NEON3 (right).
// In the right-hand code, v1 is added to v2 and v3 just before the
// "shrn ... #8" narrowing (the disassembly listing annotates that operand as
// #128), but no instruction in this asm block writes v1 - presumably the load
// of the rounding constant is missing; confirm.
void GaussRow_NEON(const uint16* src0, uint16* dst, int width) { void GaussRow_NEON3(const uint32* src, uint16* dst, int width) {
asm volatile( asm volatile(
"ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [%3] \n" "movi v0.4s, #6 \n" // constant 6
"add %0, %0, #0x10 \n"
"add %1, %1, #0x8 \n"
"1: \n" "1: \n"
"ld1 {v0.8h}, [%0], %4 \n" // load 8 source samples
"subs %w2, %w2, #4 \n" // 4 processed per loop
"umull v1.4s, v0.4h, v20.4h \n" // first pixel "ldp q2, q3, [%0,#-16] \n"
"umlal2 v1.4s, v0.8h, v20.8h \n" "ldur q6, [%0,#-8] \n"
"addv s1, v1.4s \n" "ldur q4, [%0,#-12] \n"
"ldur q5, [%0,#4] \n"
"umull v2.4s, v0.4h, v21.4h \n" // second pixel "ldur q7, [%0,#8] \n"
"umlal2 v2.4s, v0.8h, v21.8h \n" "ldur q16, [%0,#-4] \n"
"addv s2, v2.4s \n" "ldur q17, [%0,#12] \n"
"ldr q18, [%0,#16] \n"
"umull v3.4s, v0.4h, v22.4h \n" // third pixel "mla v2.4s, v6.4s, v0.4s \n"
"umlal2 v3.4s, v0.8h, v22.8h \n" "add v4.4s, v16.4s, v4.4s \n"
"addv s3, v3.4s \n" "add v5.4s, v17.4s, v5.4s \n"
"add v2.4s, v2.4s, v3.4s \n"
"umull v4.4s, v0.4h, v23.4h \n" // fourth pixel "mla v3.4s, v7.4s, v0.4s \n"
"umlal2 v4.4s, v0.8h, v23.8h \n" "shl v4.4s, v4.4s, #2 \n"
"addv s4, v4.4s \n" "shl v5.4s, v5.4s, #2 \n"
"add v3.4s, v3.4s, v18.4s \n"
"st4 {v1.s,v2.s,v3.s,v4.s}[0], [%1], #16 \n" // store 4 samples "add v2.4s, v2.4s, v4.4s \n"
"add v3.4s, v3.4s, v5.4s \n"
"add v2.4s, v2.4s, v1.4s \n"
"add v3.4s, v3.4s, v1.4s \n"
"shrn v2.4h, v2.4s, #8 \n"
"shrn v3.4h, v3.4s, #8 \n"
"subs %w2, %w2, #0x8 \n"
"stp d2, d3, [%1,#-8] \n"
"add %1, %1, #0x10 \n"
"add %0, %0, #0x20 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src0), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(&kGauseCoefficients[0]), // %3 :
"r"(8LL) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v20", "v21", "v22", "v16", "v17", "v18" );
"v23"); }
// Plain C variant (no NEON instructions despite the name) that performs the
// same arithmetic as GaussRow_C: each output is the 1, 4, 6, 4, 1 weighted sum
// of 5 consecutive source samples, plus 128, shifted right by 8.
void GaussRow_NEON2(const uint32* src, uint16* dst, int width) {
  int remaining;
  for (remaining = width; remaining > 0; --remaining) {
    // Outer taps weigh 1, the two neighbours of the centre 4, the centre 6.
    *dst = (src[0] + src[4] + 4 * (src[1] + src[3]) + 6 * src[2] + 128) >> 8;
    ++dst;
    ++src;
  }
}
// Filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients to
// produce each output sample (horizontal filter; the input is a single row).
// Each loop iteration loads 12 samples from src plus 8 samples from each of
// the three offset pointers, and stores 8 16-bit results to dst. width is
// decremented by 8 per iteration and the loop repeats while it stays > 0.
void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
const uint32* src1 = src + 1; // samples at offset 1 (weight 4)
const uint32* src2 = src + 2; // samples at offset 2 (weight 6)
const uint32* src3 = src + 3; // samples at offset 3 (weight 4)
asm volatile(
"movi v6.4s, #4 \n" // constant 4
"movi v7.4s, #6 \n" // constant 6
"1: \n"
"ld1 {v1.4s,v2.4s,v3.4s}, [%0], %6 \n" // load 12 source samples; src advances by 32 bytes (8 samples)
"ld1 {v4.4s,v5.4s}, [%1], #32 \n" // 8 samples starting at offset 1
"ld1 {v16.4s,v17.4s}, [%2], #32 \n" // 8 samples starting at offset 2
"ld1 {v18.4s,v19.4s}, [%3], #32 \n" // 8 samples starting at offset 3
"subs %w5, %w5, #8 \n" // 8 processed per loop
"add v0.4s, v1.4s, v2.4s \n" // * 1: offsets 0 and 4, outputs 0..3
"add v1.4s, v2.4s, v3.4s \n" // * 1: offsets 0 and 4, outputs 4..7
"add v2.4s, v4.4s, v18.4s \n" // sum offsets 1 and 3, which share weight 4
"add v3.4s, v5.4s, v19.4s \n"
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"mla v0.4s, v16.4s, v7.4s \n" // * 6
"mla v1.4s, v17.4s, v7.4s \n" // * 6
"uqrshrn v0.4h, v0.4s, #8 \n" // rounding, saturating shift right by 8; narrow to 16 bits
"uqrshrn2 v0.8h, v1.4s, #8 \n" // same for outputs 4..7, into the upper half of v0
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
: "r"(32LL) // %6 post-index byte step for src
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19" );
}
// Filter 5 adjacent samples of one row with 1, 4, 6, 4, 1 coefficients to
// produce each output sample (horizontal filter; the input is a single row).
// Each loop iteration loads 8 samples from src plus 4 samples from each of
// the three offset pointers, and stores 4 16-bit results to dst. width is
// decremented by 4 per iteration and the loop repeats while it stays > 0.
void GaussRow_NEON4(const uint32* src, uint16* dst, int width) {
const uint32* src1 = src + 1; // samples at offset 1 (weight 4)
const uint32* src2 = src + 2; // samples at offset 2 (weight 6)
const uint32* src3 = src + 3; // samples at offset 3 (weight 4)
asm volatile(
"movi v6.4s, #4 \n" // constant 4
"movi v7.4s, #6 \n" // constant 6
"1: \n"
"ld1 {v0.4s,v1.4s}, [%0], %6 \n" // load 8 source samples; src advances by 16 bytes (4 samples)
"ld1 {v2.4s}, [%1], #16 \n" // 4 samples starting at offset 1
"ld1 {v3.4s}, [%2], #16 \n" // 4 samples starting at offset 2
"ld1 {v4.4s}, [%3], #16 \n" // 4 samples starting at offset 3
"subs %w5, %w5, #4 \n" // 4 processed per loop
"mla v0.4s, v2.4s, v6.4s \n" // * 4 (v0 already holds offset 0, weight 1)
"mla v0.4s, v3.4s, v7.4s \n" // * 6
"mla v0.4s, v4.4s, v6.4s \n" // * 4
"add v0.4s, v0.4s, v1.4s \n" // * 1 (offset 4)
"uqrshrn v0.4h, v0.4s, #8 \n" // rounding, saturating shift right by 8; narrow to 16 bits
"st1 {v0.4h}, [%4], #8 \n" // store 4 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(width) // %5
: "r"(16LL) // %6 post-index byte step for src
: "cc", "memory", "v0", "v1", "v0", "v1", "v2", "v3", "v4", "v6", "v7" ); // NOTE(review): "v0", "v1" are listed twice
} }
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

View File

@ -2725,23 +2725,22 @@ TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) extern "C" void GaussRow_NEON(const uint32* src, uint16* dst, int width);
extern "C" void GaussRow_C(const uint32* src, uint16* dst, int width);
extern "C" void GaussRow_NEON(const uint16* src0, uint32* dst, int width);
TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) { TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
SIMD_ALIGNED(uint16 orig_pixels[1280 + 4]); SIMD_ALIGNED(uint32 orig_pixels[1280 + 4]);
SIMD_ALIGNED(uint32 dst_pixels_c[1280]); SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
SIMD_ALIGNED(uint32 dst_pixels_opt[1280]); SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
memset(orig_pixels, 0, sizeof(orig_pixels)); memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
for (int i = 0; i < 1280 + 4; ++i) { for (int i = 0; i < 1280 + 4; ++i) {
orig_pixels[i] = i; orig_pixels[i] = i * 256;
} }
GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280); GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
MaskCpuFlags(benchmark_cpu_info_); MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) { for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
@ -2749,10 +2748,10 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
if (has_neon) { if (has_neon) {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} else { } else {
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
} }
#else #else
GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
#endif #endif
} }
@ -2824,6 +2823,4 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
EXPECT_EQ(dst_pixels_c[1279], 61424); EXPECT_EQ(dst_pixels_c[1279], 61424);
} }
#endif // aarch64
} // namespace libyuv } // namespace libyuv