diff --git a/source/row_common.cc b/source/row_common.cc index 436e6dd3f..154ec3652 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2672,6 +2672,20 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) { } } +// Filter 5 rows with 1, 4, 6, 4, 1 coefficients into 1 row of 32 bit sums. +void GaussCol_C(const uint16* src0, + const uint16* src1, + const uint16* src2, + const uint16* src3, + const uint16* src4, + uint32* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_neon64.cc b/source/row_neon64.cc index cb8192b0c..292c6e229 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2699,6 +2699,50 @@ static vec16 kGauseCoefficients[4] = { {0, 0, 0, 1, 4, 6, 4, 1}, }; +// Filter 5 rows with 1, 4, 6, 4, 1 coefficients into 1 row of 32 bit sums. +void GaussCol_NEON(const uint16* src0, + const uint16* src1, + const uint16* src2, + const uint16* src3, + const uint16* src4, + uint32* dst, + int width) { + asm volatile( + "movi v6.8h, #4 \n" // constant 4 + "movi v7.8h, #6 \n" // constant 6 + + "1: \n" + "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows + "ld1 {v2.8h}, [%1], #16 \n" + "ld1 {v3.8h}, [%2], #16 \n" + "ld1 {v4.8h}, [%3], #16 \n" + "ld1 {v5.8h}, [%4], #16 \n" + "subs %w6, %w6, #8 \n" // 8 processed per loop + + "uaddl v0.4s, v1.4h, v5.4h \n" // rows 0, 4 * 1 + "uaddl2 v1.4s, v1.8h, v5.8h \n" // rows 0, 4 * 1 + + "umlal v0.4s, v2.4h, v6.4h \n" // row 1 * 4 + "umlal2 v1.4s, v2.8h, v6.8h \n" // row 1 * 4 + "umlal v0.4s, v3.4h, v7.4h \n" // row 2 * 6 + "umlal2 v1.4s, v3.8h, v7.8h \n" // row 2 * 6 + "umlal v0.4s, v4.4h, v6.4h \n" // row 3 * 4 + "umlal2 v1.4s, v4.8h, v6.8h \n" // row 3 * 4 + + "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 sums + "b.gt 1b \n" + + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", 
"memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. void GaussRow_NEON(const uint16* src0, uint16* dst, int width) { asm volatile( @@ -2736,46 +2780,6 @@ void GaussRow_NEON(const uint16* src0, uint16* dst, int width) { "v23"); } -// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussCol_NEON(const uint32* src0, - const uint32* src1, - const uint32* src2, - const uint32* src3, - const uint32* src4, - uint16* dst, - int width) { - asm volatile( - "movi v5.4s, #4 \n" // constant 4 - "movi v6.4s, #6 \n" // constant 6 - - "1: \n" - "ld1 {v0.4s}, [%0], #16 \n" // load 4 samples, 5 rows - "ld1 {v1.4s}, [%1], #16 \n" - "ld1 {v2.4s}, [%2], #16 \n" - "ld1 {v3.4s}, [%3], #16 \n" - "ld1 {v4.4s}, [%4], #16 \n" - "subs %w6, %w6, #4 \n" // 4 processed per loop - - "add v0.4s, v0.4s, v4.4s \n" // * 1 - "mla v0.4s, v1.4s, v5.4s \n" // * 4 - "mla v0.4s, v2.4s, v6.4s \n" // * 6 - "mla v0.4s, v3.4s, v5.4s \n" // * 4 - - "uqshrn v0.4h, v0.4s, #8 \n" // round, shift by 8 pack. 
- "st1 {v0.4h}, [%5], #8 \n" // store 4 samples - "b.gt 1b \n" - - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); -} - #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index fe19e6b97..25fd8e3b0 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2742,6 +2742,7 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) { orig_pixels[i] = i; } GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280); + MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) int has_neon = TestCpuFlag(kCpuHasNEON); @@ -2760,22 +2761,29 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) { } EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1); - EXPECT_EQ(dst_pixels_c[1279], - 1279 * 1 + 1280 * 4 + 1281 * 6 + 1282 * 4 + 1283 * 1); + EXPECT_EQ(dst_pixels_c[1279], 20496); } -extern "C" void GaussCol_NEON(const uint32* src0, - const uint32* src1, - const uint32* src2, - const uint32* src3, - const uint32* src4, - uint16* dst, +extern "C" void GaussCol_NEON(const uint16* src0, + const uint16* src1, + const uint16* src2, + const uint16* src3, + const uint16* src4, + uint32* dst, int width); +extern "C" void GaussCol_C(const uint16* src0, + const uint16* src1, + const uint16* src2, + const uint16* src3, + const uint16* src4, + uint32* dst, + int width); + TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { - SIMD_ALIGNED(uint32 orig_pixels[1280 * 5]); - SIMD_ALIGNED(uint16 dst_pixels_c[1280]); - SIMD_ALIGNED(uint16 dst_pixels_opt[1280]); + SIMD_ALIGNED(uint16 orig_pixels[1280 * 5]); + SIMD_ALIGNED(uint32 dst_pixels_c[1280]); + SIMD_ALIGNED(uint32 dst_pixels_opt[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_c, 1, 
sizeof(dst_pixels_c)); @@ -2784,9 +2792,10 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { for (int i = 0; i < 1280 * 5; ++i) { orig_pixels[i] = i; } - GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], - &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], - &dst_pixels_c[0], 1280); + GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0], + 1280); + MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) int has_neon = TestCpuFlag(kCpuHasNEON); @@ -2795,14 +2804,14 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); } else { - GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], - &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], - &dst_pixels_opt[0], 1280); + GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); } #else - GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], - &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], - &dst_pixels_opt[0], 1280); + GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], + &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], + &dst_pixels_opt[0], 1280); #endif } @@ -2810,10 +2819,9 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } - EXPECT_EQ(dst_pixels_c[0], (0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + - 1280 * 4 * 1 + 128) / - 256); - EXPECT_EQ(dst_pixels_c[1279], 239); + EXPECT_EQ(dst_pixels_c[0], + 0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + 1280 * 4 * 1); + EXPECT_EQ(dst_pixels_c[1279], 61424); } #endif // aarch64