diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 146feff0c..077def307 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -995,6 +995,112 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb,
 
 #undef LOAD2_DATA32_LANE
 
+// Read 16x2 pixels, average down and write 8x1.
+void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16* dst,
+                              int dst_width) {
+  asm volatile(
+      // Turn the byte stride into a pointer to row 2.
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "ld1        {v0.8h, v1.8h}, [%0], #32      \n"  // load row 1 and post inc
+      "ld1        {v2.8h, v3.8h}, [%1], #32      \n"  // load row 2 and post inc
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
+      "uaddlp     v0.4s, v0.8h                   \n"  // row 1 add adjacent
+      "uaddlp     v1.4s, v1.8h                   \n"
+      "uadalp     v0.4s, v2.8h                   \n"  // += row 2 add adjacent
+      "uadalp     v1.4s, v3.8h                   \n"
+      "rshrn      v0.4h, v0.4s, #2               \n"  // round, shift and pack
+      "rshrn2     v0.8h, v1.4s, #2               \n"
+      "st1        {v0.8h}, [%2], #16             \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
+// Read 8x2 pixels, upsample with filtering and write 16x1.
+// Actually reads an extra pixel per row, so 9x2.
+void ScaleRowUp2_16_NEON(const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16* dst,
+                         int dst_width) {
+  asm volatile(
+      // Turn the byte stride into a pointer to row 2.
+      "add        %1, %1, %0                     \n"
+      "movi       v20.4h, #1                     \n"  // constants
+      "movi       v21.4h, #3                     \n"
+      "movi       v22.4h, #9                     \n"
+
+      "1:                                        \n"
+      "ld2        {v0.4h, v1.4h}, [%0], %4       \n"  // load row 1 even pixels
+      "ld2        {v2.4h, v3.4h}, [%1], %4       \n"  // load row 2
+      // Consider a variation of this for the last 8x2 that replicates the
+      // last pixel instead of reading one pixel past the row.
+      "ld2        {v4.4h, v5.4h}, [%0], %5       \n"  // load row 1 odd pixels
+      "ld2        {v6.4h, v7.4h}, [%1], %5       \n"  // load row 2
+      "subs       %w3, %w3, #16                  \n"  // 16 dst pixels per loop
+
+      // Filter first 2x2 group to produce the 1st, 5th, 9th and 13th dest
+      // pixels (the st4 below interleaves v8..v11 lane by lane).
+      // 9 3
+      // 3 1
+      "umull      v8.4s, v0.4h, v22.4h           \n"
+      "umlal      v8.4s, v1.4h, v21.4h           \n"
+      "umlal      v8.4s, v2.4h, v21.4h           \n"
+      "umlal      v8.4s, v3.4h, v20.4h           \n"
+
+      // Filter first 2x2 group to produce the 2nd, 6th, 10th and 14th dest
+      // pixels.
+      // 3 9
+      // 1 3
+      "umull      v9.4s, v0.4h, v21.4h           \n"
+      "umlal      v9.4s, v1.4h, v22.4h           \n"
+      "umlal      v9.4s, v2.4h, v20.4h           \n"
+      "umlal      v9.4s, v3.4h, v21.4h           \n"
+
+      // Filter second 2x2 group to produce the 3rd, 7th, 11th and 15th dest
+      // pixels.
+      // 9 3
+      // 3 1
+      "umull      v10.4s, v4.4h, v22.4h          \n"
+      "umlal      v10.4s, v5.4h, v21.4h          \n"
+      "umlal      v10.4s, v6.4h, v21.4h          \n"
+      "umlal      v10.4s, v7.4h, v20.4h          \n"
+
+      // Filter second 2x2 group to produce the 4th, 8th, 12th and 16th dest
+      // pixels.
+      // 3 9
+      // 1 3
+      "umull      v11.4s, v4.4h, v21.4h          \n"
+      "umlal      v11.4s, v5.4h, v22.4h          \n"
+      "umlal      v11.4s, v6.4h, v20.4h          \n"
+      "umlal      v11.4s, v7.4h, v21.4h          \n"
+
+      "uqrshrn    v8.4h, v8.4s, #4               \n"  // round, shift, saturate
+      "uqrshrn    v9.4h, v9.4s, #4               \n"
+      "uqrshrn    v10.4h, v10.4s, #4             \n"
+      "uqrshrn    v11.4h, v11.4s, #4             \n"
+
+      "st4        {v8.4h, v9.4h, v10.4h, v11.4h}, [%2], #32 \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      : "r"(2LL),          // %4  advance 1 pixel (2 bytes)
+        "r"(14LL)          // %5  advance 7 pixels (14 bytes)
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+        "v8", "v9", "v10", "v11", "v20", "v21", "v22"  // Clobber List
+  );
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
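For review, here is a scalar C sketch of what the two kernels compute, as I read the assembly; it is not part of the patch, and the `_Ref` names are hypothetical. The box kernel averages each 2x2 block of 16-bit pixels with rounding; the upsample kernel is a 2x bilinear filter with 9/3/3/1 weights. Note that `src_stride` is in bytes, matching how the unit tests call these functions, and that the NEON upsample saturates via `uqrshrn` while this sketch does not.

```cpp
#include <stddef.h>
#include <stdint.h>

// Scalar model of ScaleRowDown2Box_16_NEON: each destination pixel is the
// rounded average of a 2x2 block of 16-bit source pixels.
void ScaleRowDown2Box_16_Ref(const uint16_t* src_ptr,
                             ptrdiff_t src_stride,  // in bytes
                             uint16_t* dst,
                             int dst_width) {
  const uint16_t* s = src_ptr;
  const uint16_t* t =
      (const uint16_t*)((const uint8_t*)src_ptr + src_stride);  // row 2
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint16_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}

// Scalar model of ScaleRowUp2_16_NEON: 2x bilinear upsample with 9/3/3/1
// weights, reading dst_width / 2 + 1 pixels from each source row (the
// "extra pixel" the header comment mentions). Unlike uqrshrn, no saturation.
void ScaleRowUp2_16_Ref(const uint16_t* src_ptr,
                        ptrdiff_t src_stride,  // in bytes
                        uint16_t* dst,
                        int dst_width) {
  const uint16_t* s = src_ptr;
  const uint16_t* t =
      (const uint16_t*)((const uint8_t*)src_ptr + src_stride);  // row 2
  for (int x = 0; x < dst_width; x += 2) {
    dst[x + 0] =
        (uint16_t)((9 * s[0] + 3 * s[1] + 3 * t[0] + 1 * t[1] + 8) >> 4);
    dst[x + 1] =
        (uint16_t)((3 * s[0] + 9 * s[1] + 1 * t[0] + 3 * t[1] + 8) >> 4);
    s += 1;
    t += 1;
  }
}
```

Feeding the test patterns below through these models reproduces the spot-checked output values the tests assert.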
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 8367dd2d2..ffd05a7de 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -450,4 +450,80 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
 }
 #endif  // HAS_SCALEROWDOWN2_SSSE3
 
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
+                                    ptrdiff_t src_stride,
+                                    uint16* dst,
+                                    int dst_width);
+
+TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
+  SIMD_ALIGNED(uint16 orig_pixels[640 * 2 + 1]);  // 2 rows + 1 pixel overrun
+  SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
+  SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
+
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+  memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
+  memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
+
+  for (int i = 0; i < 640 * 2 + 1; ++i) {
+    orig_pixels[i] = i;
+  }
+  ScaleRowUp2_16_NEON(&orig_pixels[0], 640 * 2, &dst_pixels_c[0], 1280);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ScaleRowUp2_16_NEON(&orig_pixels[0], 640 * 2, &dst_pixels_opt[0], 1280);
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16);
+  EXPECT_EQ(dst_pixels_c[1279], 800);
+}
+
+extern "C" void ScaleRowDown2Box_16_NEON(const uint16* src_ptr,
+                                         ptrdiff_t src_stride,
+                                         uint16* dst,
+                                         int dst_width);
+
+TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
+  SIMD_ALIGNED(uint16 orig_pixels[2560 * 2]);
+  SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
+  SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
+
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+  memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt));
+  memset(dst_pixels_c, 2, sizeof(dst_pixels_c));
+
+  for (int i = 0; i < 2560 * 2; ++i) {
+    orig_pixels[i] = i;
+  }
+  ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560 * 2, &dst_pixels_c[0], 1280);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560 * 2, &dst_pixels_opt[0],
+                             1280);
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  EXPECT_EQ(dst_pixels_c[0], 1281);
+  EXPECT_EQ(dst_pixels_c[1279], 3839);
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
 }  // namespace libyuv
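As a sanity check on the magic numbers in the expectations, here is the worked arithmetic, assuming the rounded 2x2 average and the 9/3/3/1 filter described above (a sketch, not test code from the patch):

```cpp
#include <assert.h>

int main() {
  // TestScaleRowDown2Box_16: orig_pixels[i] = i, rows 2560 pixels apart.
  // First output averages src[0], src[1], src[2560], src[2561].
  assert(((0 + 1 + 2560 + 2561 + 2) >> 2) == 1281);
  // Last output averages src[2558], src[2559], src[5118], src[5119].
  assert(((2558 + 2559 + 5118 + 5119 + 2) >> 2) == 3839);

  // TestScaleRowUp2_16: the last output blends row 1 pixels 639 and 640
  // (values 639, 640) with row 2 pixels 639 and 640 (values 1279, 1280).
  assert(((3 * 639 + 9 * 640 + 1 * 1279 + 3 * 1280 + 8) >> 4) == 800);
  return 0;
}
```

The last assertion also shows why the buffer is sized `640 * 2 + 1`: the upsample reads one pixel past the end of row 2, the overrun the test comment calls out.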