From c5bad809b19031a5288b0c7bb1923de931ab29f8 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 21 Aug 2017 16:14:56 -0700 Subject: [PATCH] Gauss unittest, Scale comments for neon64 half size updated [ RUN ] LibYUVPlanarTest.TestGaussRow_Opt [ OK ] LibYUVPlanarTest.TestGaussRow_Opt (1274 ms) [ RUN ] LibYUVPlanarTest.TestGaussCol_Opt [ OK ] LibYUVPlanarTest.TestGaussCol_Opt (916 ms) TBR=kjellander@chromium.org BUG=libyuv:719 TEST=LibYUVPlanarTest.TestGaussRow_Opt Change-Id: Id480f3870c40c2b40dfb9f072cb7118ebad41afc Reviewed-on: https://chromium-review.googlesource.com/624701 Reviewed-by: Cheng Wang --- source/row_neon64.cc | 28 ++++++------ source/scale_neon64.cc | 6 +-- unit_test/planar_test.cc | 93 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 19 deletions(-) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 5c9994d52..cb8192b0c 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2693,16 +2693,14 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { } static vec16 kGauseCoefficients[4] = { - {1, 4, 6, 4, 1, 0, 0, 0}, - {0, 1, 4, 6, 4, 1, 0, 0}, - {0, 0, 1, 4, 6, 4, 1, 0}, - {0, 0, 0, 1, 4, 6, 4, 1}, + {1, 4, 6, 4, 1, 0, 0, 0}, + {0, 1, 4, 6, 4, 1, 0, 0}, + {0, 0, 1, 4, 6, 4, 1, 0}, + {0, 0, 0, 1, 4, 6, 4, 1}, }; // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_NEON(const uint16* src0, - uint16* dst, - int width) { +void GaussRow_NEON(const uint16* src0, uint16* dst, int width) { asm volatile( "ld1 {v20.8h,v21.8h,v22.8h,v23.8h}, [%3] \n" @@ -2725,17 +2723,17 @@ void GaussRow_NEON(const uint16* src0, "umull v4.4s, v0.4h, v23.4h \n" // forth pixel "umlal2 v4.4s, v0.8h, v23.8h \n" "addv s4, v4.4s \n" - + "st4 {v1.s,v2.s,v3.s,v4.s}[0], [%1], #16 \n" // store 4 samples "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kGauseCoefficients[0]) // %3 - "r"(8LL) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v20", "v21", "v22", "v23"); + : "+r"(src0), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kGauseCoefficients[0]), // %3 + "r"(8LL) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v20", "v21", "v22", + "v23"); } // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 36318a9b1..7f5d0c414 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1009,11 +1009,9 @@ void ScaleRowDown2Box_16_NEON(const uint16* src_ptr, "subs %w3, %w3, #8 \n" // 8 processed per loop "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent "uaddlp v1.4s, v1.8h \n" - "uadalp v0.4s, v2.8h \n" // row 2 add adjacent + - // row1 + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent "uadalp v1.4s, v3.8h \n" - "rshrn v0.4h, v0.4s, #2 \n" // downshift, round and - // pack + "rshrn v0.4h, v0.4s, #2 \n" // round and pack "rshrn2 v0.8h, v1.4s, #2 \n" "st1 {v0.8h}, [%2], #16 \n" "b.gt 1b \n" diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index dbae3658b..fe19e6b97 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2725,4 +2725,97 @@ TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) { EXPECT_EQ(0, diff); } +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +extern "C" void GaussRow_NEON(const uint16* src0, uint32* dst, int width); + +TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) { + SIMD_ALIGNED(uint16 orig_pixels[1280 + 4]); + SIMD_ALIGNED(uint32 dst_pixels_c[1280]); + SIMD_ALIGNED(uint32 dst_pixels_opt[1280]); + + memset(orig_pixels, 0, sizeof(orig_pixels)); + memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); + memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); + + for (int i = 0; i < 1280 + 4; ++i) { + orig_pixels[i] = i; + } + GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + int has_neon = TestCpuFlag(kCpuHasNEON); + if (has_neon) { + GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); + } else { + GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); + } +#else + GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); +#endif + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1); + EXPECT_EQ(dst_pixels_c[1279], + 1279 * 1 + 1280 * 4 + 1281 * 6 + 1282 * 4 + 1283 * 1); +} + +extern "C" void GaussCol_NEON(const uint32* src0, + const uint32* src1, + const uint32* src2, + const uint32* src3, + const uint32* src4, + uint16* dst, + int width); + +TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { + SIMD_ALIGNED(uint32 orig_pixels[1280 * 5]); + SIMD_ALIGNED(uint16 dst_pixels_c[1280]); + SIMD_ALIGNED(uint16 dst_pixels_opt[1280]); + + memset(orig_pixels, 0, sizeof(orig_pixels)); + memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); + memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); + + for (int i = 0; i < 1280 * 5; ++i) { + orig_pixels[i] = i; + } + GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_c[0], 1280); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + int has_neon = TestCpuFlag(kCpuHasNEON); + if (has_neon) { + GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); + } else { + GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); + } +#else + GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); +#endif + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + EXPECT_EQ(dst_pixels_c[0], (0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + + 1280 * 4 * 1 + 128) / + 256); + EXPECT_EQ(dst_pixels_c[1279], 239); +} + +#endif // aarch64 + } // namespace libyuv