From 8f5e9cd9ebc76c39155c48e8420fe76ef7cac185 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Tue, 5 Sep 2017 14:40:27 -0700
Subject: [PATCH] ScaleRowUp2_16_C port of NEON to C

Single pass upsample with bilinear filter.
NEON version optimized - Pixel Sailfish QC821
Was TestScaleRowUp2_16 (5741 ms)
Now TestScaleRowUp2_16 (4484 ms)
C   TestScaleRowUp2_16 (6555 ms)

TBR=kjellander@chromium.org
BUG=libyuv:718
TEST=LibYUVScaleTest.TestScaleRowUp2_16 (709 ms)

Change-Id: Ib04ceb53e0ab644a392c39c3396e313530161d92
Reviewed-on: https://chromium-review.googlesource.com/646701
Reviewed-by: Frank Barchard
Reviewed-by: Cheng Wang
---
 source/scale_common.cc  | 29 ++++++++++++++
 source/scale_neon64.cc  | 84 +++++++++++++++--------------------------
 unit_test/scale_test.cc | 23 ++++++++---
 3 files changed, 76 insertions(+), 60 deletions(-)

diff --git a/source/scale_common.cc b/source/scale_common.cc
index 1bef39df8..fefb027bf 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1306,6 +1306,35 @@ void ScaleSlope(int src_width,
 }
 #undef CENTERSTART

+// Read 8x2 upsample with filtering and write 16x1.
+// actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_C(const uint16* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint16* dst,
+                      int dst_width) {
+  const uint16* src2 = src_ptr + src_stride;
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    uint16 p0 = src_ptr[0];
+    uint16 p1 = src_ptr[1];
+    uint16 p2 = src2[0];
+    uint16 p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+    dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
+    ++src_ptr;
+    ++src2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    uint16 p0 = src_ptr[0];
+    uint16 p1 = src_ptr[1];
+    uint16 p2 = src2[0];
+    uint16 p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index c8e576883..5e79ece67 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -1010,66 +1010,43 @@ void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
 }

 // Read 8x2 upsample with filtering and write 16x1.
-// actually reads an extra pixel, so 9x2.
+// Actually reads an extra pixel, so 9x2.
 void ScaleRowUp2_16_NEON(const uint16* src_ptr,
                          ptrdiff_t src_stride,
                          uint16* dst,
                          int dst_width) {
   asm volatile(
       "add %1, %0, %1, lsl #1 \n"  // ptr + stide * 2
-      "movi v20.4h, #1 \n"
-      "movi v21.4h, #3 \n"  // constants
-      "movi v22.4h, #9 \n"
+      "movi v0.8h, #9 \n"  // constants
+      "movi v1.4s, #3 \n"

       "1: \n"
-      "ld2 {v0.4h, v1.4h}, [%0], %4 \n"  // load row 1 even pixels
-      "ld2 {v2.4h, v3.4h}, [%1], %4 \n"  // load row 2
-
-      // consider a variation of this for last 8x2 that replicates the last
-      // pixel.
-      "ld2 {v4.4h, v5.4h}, [%0], %5 \n"  // load row 1 odd pixels
-      "ld2 {v6.4h, v7.4h}, [%1], %5 \n"  // load row 2
-
+      "ld1 {v3.8h}, [%0], %4 \n"  // TL read first 8
+      "ld1 {v4.8h}, [%0], %5 \n"  // TR read 8 offset by 1
+      "ld1 {v5.8h}, [%1], %4 \n"  // BL read 8 from next row
+      "ld1 {v6.8h}, [%1], %5 \n"  // BR offset by 1
       "subs %w3, %w3, #16 \n"  // 16 dst pixels per loop
-
-      // filter first 2x2 group to produce 1st and 4th dest pixels
-      // 9 3
-      // 3 1
-      "umull v8.4s, v0.4h, v22.4h \n"
-      "umlal v8.4s, v1.4h, v21.4h \n"
-      "umlal v8.4s, v2.4h, v21.4h \n"
-      "umlal v8.4s, v3.4h, v20.4h \n"
-
-      // filter first 2x2 group to produce 2nd and 5th dest pixel
-      // 3 9
-      // 1 3
-      "umull v9.4s, v0.4h, v21.4h \n"
-      "umlal v9.4s, v1.4h, v22.4h \n"
-      "umlal v9.4s, v2.4h, v20.4h \n"
-      "umlal v9.4s, v3.4h, v21.4h \n"
-
-      // filter second 2x2 group to produce 3rd and 6th dest pixels
-      // 9 3
-      // 3 1
-      "umull v10.4s, v4.4h, v22.4h \n"
-      "umlal v10.4s, v5.4h, v21.4h \n"
-      "umlal v10.4s, v6.4h, v21.4h \n"
-      "umlal v10.4s, v7.4h, v20.4h \n"
-
-      // filter second 2x2 group to produce 4th and 7th dest pixel
-      // 3 9
-      // 1 3
-      "umull v11.4s, v4.4h, v21.4h \n"
-      "umlal v11.4s, v5.4h, v22.4h \n"
-      "umlal v11.4s, v6.4h, v20.4h \n"
-      "umlal v11.4s, v7.4h, v21.4h \n"
-
-      "uqrshrn v8.4h, v8.4s, #4 \n"  // downshift, round
-      "uqrshrn v9.4h, v9.4s, #4 \n"
-      "uqrshrn v10.4h, v10.4s, #4 \n"
-      "uqrshrn v11.4h, v11.4s, #4 \n"
-
-      "st4 {v8.4h, v9.4h, v10.4h, v11.4h}, [%2], #32 \n"
+      "umull v16.4s, v3.4h, v0.4h \n"
+      "umull2 v7.4s, v3.8h, v0.8h \n"
+      "umull v18.4s, v4.4h, v0.4h \n"
+      "umull2 v17.4s, v4.8h, v0.8h \n"
+      "uaddw v16.4s, v16.4s, v6.4h \n"
+      "uaddl2 v19.4s, v6.8h, v3.8h \n"
+      "uaddl v3.4s, v6.4h, v3.4h \n"
+      "uaddw2 v6.4s, v7.4s, v6.8h \n"
+      "uaddl2 v7.4s, v5.8h, v4.8h \n"
+      "uaddl v4.4s, v5.4h, v4.4h \n"
+      "uaddw v18.4s, v18.4s, v5.4h \n"
+      "mla v16.4s, v4.4s, v1.4s \n"
+      "mla v18.4s, v3.4s, v1.4s \n"
+      "mla v6.4s, v7.4s, v1.4s \n"
+      "uaddw2 v4.4s, v17.4s, v5.8h \n"
+      "uqrshrn v16.4h, v16.4s, #4 \n"
+      "mla v4.4s, v19.4s, v1.4s \n"
+      "uqrshrn2 v16.8h, v6.4s, #4 \n"
+      "uqrshrn v17.4h, v18.4s, #4 \n"
+      "uqrshrn2 v17.8h, v4.4s, #4 \n"
+      "st2 {v16.8h-v17.8h}, [%2], #32 \n"
       "b.gt 1b \n"
       : "+r"(src_ptr),  // %0
         "+r"(src_stride),  // %1
@@ -1077,9 +1054,8 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
         "+r"(dst_width)  // %3
       : "r"(2LL),  // %4
         "r"(14LL)  // %5
-
-      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
-        "v11", "v20", "v21", "v22"  // Clobber List
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19"  // Clobber List
   );
 }

diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 355c6d08a..a782a0915 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -450,12 +450,14 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
 }
 #endif  // HAS_SCALEROWDOWN2_SSSE3

-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
 extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint16* dst,
                                     int dst_width);
+extern "C" void ScaleRowUp2_16_C(const uint16* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint16* dst,
+                                 int dst_width);

 TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]);  // 2 rows + 1 pixel overrun
@@ -469,10 +471,19 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   for (int i = 0; i < 640 * 2 + 1; ++i) {
     orig_pixels[i] = i;
   }
-  ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
-
+  ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
+  MaskCpuFlags(benchmark_cpu_info_);
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    if (has_neon) {
+      ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+    } else {
+      ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+    }
+#else
+    ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
+#endif
   }

   for (int i = 0; i < 1280; ++i) {
@@ -481,7 +492,6 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
   EXPECT_EQ(dst_pixels_c[1279], 800);
 }
-#endif

 extern "C" void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint16* dst,
                                          int dst_width);
@@ -501,6 +511,7 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
     orig_pixels[i] = i;
   }
   ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
+  MaskCpuFlags(benchmark_cpu_info_);
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     int has_neon = TestCpuFlag(kCpuHasNEON);
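
Note, for readers outside the libyuv tree: the sketch below is a minimal standalone version of the same 9:3:3:1 bilinear kernel that ScaleRowUp2_16_C adds above, with a tiny harness that mirrors TestScaleRowUp2_16. It is an illustration only, not part of the patch: the name ScaleRowUp2_16Ref and the main() driver are invented here, and plain uint16_t stands in for libyuv's uint16 typedef.

// Standalone illustration; not libyuv code.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

// Same 9:3:3:1 kernel as the ScaleRowUp2_16_C added by this patch, written
// against plain uint16_t. The name ScaleRowUp2_16Ref is invented for this
// sketch.
static void ScaleRowUp2_16Ref(const uint16_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint16_t* dst,
                              int dst_width) {
  const uint16_t* src2 = src_ptr + src_stride;  // next source row
  int x;
  for (x = 0; x < dst_width - 1; x += 2) {
    uint16_t p0 = src_ptr[0];  // top-left
    uint16_t p1 = src_ptr[1];  // top-right
    uint16_t p2 = src2[0];     // bottom-left
    uint16_t p3 = src2[1];     // bottom-right
    dst[0] = (uint16_t)((p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4);
    dst[1] = (uint16_t)((p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4);
    ++src_ptr;
    ++src2;
    dst += 2;
  }
  if (dst_width & 1) {  // odd width: emit one last 9:3:3:1 pixel
    uint16_t p0 = src_ptr[0];
    uint16_t p1 = src_ptr[1];
    uint16_t p2 = src2[0];
    uint16_t p3 = src2[1];
    dst[0] = (uint16_t)((p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4);
  }
}

int main(void) {
  // Mirror the unit test: two 640-pixel rows (plus one pixel of overread)
  // filled with a ramp, upsampled to a single 1280-pixel row.
  static uint16_t src[640 * 2 + 1];
  static uint16_t dst[1280];
  for (int i = 0; i < 640 * 2 + 1; ++i) {
    src[i] = (uint16_t)i;
  }
  ScaleRowUp2_16Ref(src, 640, dst, 1280);
  // Same expectations as TestScaleRowUp2_16 above:
  //   dst[0]    == (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16  (== 160)
  //   dst[1279] == 800
  printf("dst[0]=%d dst[1279]=%d\n", dst[0], dst[1279]);
  return 0;
}

Each output pair comes from one 2x2 neighbourhood of the source: the output sample nearest the top-left source pixel is weighted 9/16, its horizontal and vertical neighbours 3/16 each, and the diagonal pixel 1/16; the +8 provides round-to-nearest before the >> 4.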