diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index e7496b813..aef974330 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2706,22 +2706,19 @@ void GaussCol_NEON(const uint16* src0,
 
       "1: \n"
       "ld1 {v1.8h}, [%0], #16 \n"  // load 8 samples, 5 rows
+      "ld1 {v2.8h}, [%4], #16 \n"
+      "uaddl v0.4s, v1.4h, v2.4h \n"  // * 1
+      "uaddl2 v1.4s, v1.8h, v2.8h \n"  // * 1
       "ld1 {v2.8h}, [%1], #16 \n"
-      "ld1 {v3.8h}, [%2], #16 \n"
-      "ld1 {v4.8h}, [%3], #16 \n"
-      "ld1 {v5.8h}, [%4], #16 \n"
+      "umlal v0.4s, v2.4h, v6.4h \n"  // * 4
+      "umlal2 v1.4s, v2.8h, v6.8h \n"  // * 4
+      "ld1 {v2.8h}, [%2], #16 \n"
+      "umlal v0.4s, v2.4h, v7.4h \n"  // * 6
+      "umlal2 v1.4s, v2.8h, v7.8h \n"  // * 6
+      "ld1 {v2.8h}, [%3], #16 \n"
+      "umlal v0.4s, v2.4h, v6.4h \n"  // * 4
+      "umlal2 v1.4s, v2.8h, v6.8h \n"  // * 4
       "subs %w6, %w6, #8 \n"  // 8 processed per loop
-
-      "uaddl v0.4s, v1.4h, v5.4h \n"  // * 1
-      "uaddl2 v1.4s, v1.8h, v5.8h \n"  // * 1
-
-      "umlal v0.4s, v2.4h, v6.4h \n"  // * 4
-      "umlal2 v1.4s, v2.8h, v6.8h \n"  // * 4
-      "umlal v0.4s, v3.4h, v7.4h \n"  // * 6
-      "umlal2 v1.4s, v3.8h, v7.8h \n"  // * 6
-      "umlal v0.4s, v4.4h, v6.4h \n"  // * 4
-      "umlal2 v1.4s, v4.8h, v6.8h \n"  // * 4
-
       "st1 {v0.4s,v1.4s}, [%5], #32 \n"  // store 8 samples
       "b.gt 1b \n"
 
@@ -2733,93 +2730,7 @@ void GaussCol_NEON(const uint16* src0,
        "+r"(dst),   // %5
        "+r"(width)  // %6
      :
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
-}
-
-#if 0
-  a8: ad7f8d82  ldp   q2, q3, [x12,#-16]
-  ac: 3cdf8186  ldur  q6, [x12,#-8]
-  b0: 3cdf4184  ldur  q4, [x12,#-12]
-  b4: 3cc04185  ldur  q5, [x12,#4]
-  b8: 3cc08187  ldur  q7, [x12,#8]
-  bc: 3cdfc190  ldur  q16, [x12,#-4]
-  c0: 3cc0c191  ldur  q17, [x12,#12]
-  c4: 3dc00592  ldr   q18, [x12,#16]
-  c8: 4ea094c2  mla   v2.4s, v6.4s, v0.4s  #6
-  cc: 4ea48604  add   v4.4s, v16.4s, v4.4s
-  d0: 4ea58625  add   v5.4s, v17.4s, v5.4s
-  d4: 4ea38442  add   v2.4s, v2.4s, v3.4s
-  d8: 4ea094e3  mla   v3.4s, v7.4s, v0.4s  #6
-  dc: 4f225484  shl   v4.4s, v4.4s, #2
-  e0: 4f2254a5  shl   v5.4s, v5.4s, #2
-  e4: 4eb28463  add   v3.4s, v3.4s, v18.4s
-  e8: 4ea48442  add   v2.4s, v2.4s, v4.4s
-  ec: 4ea58463  add   v3.4s, v3.4s, v5.4s
-  f0: 4ea18442  add   v2.4s, v2.4s, v1.4s  #128
-  f4: 4ea18463  add   v3.4s, v3.4s, v1.4s  #128
-  f8: 0f188442  shrn  v2.4h, v2.4s, #8
-  fc: 0f188463  shrn  v3.4h, v3.4s, #8
- 100: f10021ad  subs  x13, x13, #0x8
- 104: 6d3f8d62  stp   d2, d3, [x11,#-8]
- 108: 9100416b  add   x11, x11, #0x10
- 10c: 9100818c  add   x12, x12, #0x20
- 110: 54fffcc1  b.ne  a8
- #endif
-
-
-// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussRow_NEON3(const uint32* src, uint16* dst, int width) {
-  asm volatile(
-      "movi v0.4s, #6 \n"  // constant 6
-      "add %0, %0, #0x10 \n"
-      "add %1, %1, #0x8 \n"
-
-      "1: \n"
-
-      "ldp q2, q3, [%0,#-16] \n"
-      "ldur q6, [%0,#-8] \n"
-      "ldur q4, [%0,#-12] \n"
-      "ldur q5, [%0,#4] \n"
-      "ldur q7, [%0,#8] \n"
-      "ldur q16, [%0,#-4] \n"
-      "ldur q17, [%0,#12] \n"
-      "ldr q18, [%0,#16] \n"
-      "mla v2.4s, v6.4s, v0.4s \n"
-      "add v4.4s, v16.4s, v4.4s \n"
-      "add v5.4s, v17.4s, v5.4s \n"
-      "add v2.4s, v2.4s, v3.4s \n"
-      "mla v3.4s, v7.4s, v0.4s \n"
-      "shl v4.4s, v4.4s, #2 \n"
-      "shl v5.4s, v5.4s, #2 \n"
-      "add v3.4s, v3.4s, v18.4s \n"
-      "add v2.4s, v2.4s, v4.4s \n"
-      "add v3.4s, v3.4s, v5.4s \n"
-      "add v2.4s, v2.4s, v1.4s \n"
-      "add v3.4s, v3.4s, v1.4s \n"
-      "shrn v2.4h, v2.4s, #8 \n"
-      "shrn v3.4h, v3.4s, #8 \n"
-      "subs %w2, %w2, #0x8 \n"
-      "stp d2, d3, [%1,#-8] \n"
-      "add %1, %1, #0x10 \n"
-      "add %0, %0, #0x20 \n"
-      "b.gt 1b \n"
-
-      : "+r"(src),   // %0
-        "+r"(dst),   // %1
-        "+r"(width)  // %2
-      :
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v16", "v17", "v18" );
-}
-
-
-void GaussRow_NEON2(const uint32* src, uint16* dst, int width) {
-  int i;
-  for (i = 0; i < width; ++i) {
-    *dst++ =
-        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
-    ++src;
-  }
+      : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
 }
 
 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
@@ -2832,19 +2743,19 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
       "movi v7.4s, #6 \n"  // constant 6
 
       "1: \n"
-      "ld1 {v1.4s,v2.4s,v3.4s}, [%0], %6 \n"  // load 12 source samples
-      "ld1 {v4.4s,v5.4s}, [%1], #32 \n"
-      "ld1 {v16.4s,v17.4s}, [%2], #32 \n"
-      "ld1 {v18.4s,v19.4s}, [%3], #32 \n"
-      "subs %w5, %w5, #8 \n"  // 8 processed per loop
-      "add v0.4s, v1.4s, v2.4s \n"  // * 1
-      "add v1.4s, v2.4s, v3.4s \n"  // * 1
-      "add v2.4s, v4.4s, v18.4s \n"  // add rows for * 4
-      "add v3.4s, v5.4s, v19.4s \n"
+      "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n"  // load 12 source samples
+      "add v0.4s, v0.4s, v1.4s \n"  // * 1
+      "add v1.4s, v1.4s, v2.4s \n"  // * 1
+      "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
+      "mla v0.4s, v2.4s, v7.4s \n"  // * 6
+      "mla v1.4s, v3.4s, v7.4s \n"  // * 6
+      "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
+      "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
+      "add v2.4s, v2.4s, v4.4s \n"  // add rows for * 4
+      "add v3.4s, v3.4s, v5.4s \n"
       "mla v0.4s, v2.4s, v6.4s \n"  // * 4
       "mla v1.4s, v3.4s, v6.4s \n"  // * 4
-      "mla v0.4s, v16.4s, v7.4s \n"  // * 6
-      "mla v1.4s, v17.4s, v7.4s \n"  // * 6
+      "subs %w5, %w5, #8 \n"  // 8 processed per loop
       "uqrshrn v0.4h, v0.4s, #8 \n"  // round and pack
       "uqrshrn2 v0.8h, v1.4s, #8 \n"
       "st1 {v0.8h}, [%4], #16 \n"  // store 8 samples
@@ -2856,44 +2767,8 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
        "+r"(src3),  // %3
        "+r"(dst),   // %4
        "+r"(width)  // %5
-      : "r"(32LL)   // %6
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v16", "v17", "v18", "v19" );
-}
-
-// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussRow_NEON4(const uint32* src, uint16* dst, int width) {
-  const uint32* src1 = src + 1;
-  const uint32* src2 = src + 2;
-  const uint32* src3 = src + 3;
-  asm volatile(
-      "movi v6.4s, #4 \n"  // constant 4
-      "movi v7.4s, #6 \n"  // constant 6
-
-      "1: \n"
-      "ld1 {v0.4s,v1.4s}, [%0], %6 \n"  // load 8 source samples
-      "ld1 {v2.4s}, [%1], #16 \n"
-      "ld1 {v3.4s}, [%2], #16 \n"
-      "ld1 {v4.4s}, [%3], #16 \n"
-      "subs %w5, %w5, #4 \n"  // 4 processed per loop
-
-      "mla v0.4s, v2.4s, v6.4s \n"  // * 4
-      "mla v0.4s, v3.4s, v7.4s \n"  // * 6
-      "mla v0.4s, v4.4s, v6.4s \n"  // * 4
-      "add v0.4s, v0.4s, v1.4s \n"  // * 1
-      "uqrshrn v0.4h, v0.4s, #8 \n"  // round and pack
-
-      "st1 {v0.4h}, [%4], #8 \n"  // store 8 samples
-      "b.gt 1b \n"
-
-      : "+r"(src),   // %0
-        "+r"(src1),  // %1
-        "+r"(src2),  // %2
-        "+r"(src3),  // %3
-        "+r"(dst),   // %4
-        "+r"(width)  // %5
-      : "r"(16LL)    // %6
-      : "cc", "memory", "v0", "v1", "v0", "v1", "v2", "v3", "v4", "v6", "v7" );
+      : "r"(32LL)   // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 7744e876b..aaae80fe1 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2759,8 +2759,9 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
-  EXPECT_EQ(dst_pixels_c[0], 0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1);
-  EXPECT_EQ(dst_pixels_c[1279], 20496);
+  EXPECT_EQ(dst_pixels_c[0],
+            static_cast<uint16>(0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1));
+  EXPECT_EQ(dst_pixels_c[1279], static_cast<uint16>(20496));
 }
 
 extern "C" void GaussCol_NEON(const uint16* src0,
@@ -2819,8 +2820,9 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
   }
 
   EXPECT_EQ(dst_pixels_c[0],
-            0 * 1 + 1280 * 4 + 1280 * 2 * 6 + 1280 * 3 * 4 + 1280 * 4 * 1);
-  EXPECT_EQ(dst_pixels_c[1279], 61424);
+            static_cast<uint32>(0 * 1 + 1280 * 4 + 1280 * 2 * 6 +
+                                1280 * 3 * 4 + 1280 * 4 * 1));
+  EXPECT_EQ(dst_pixels_c[1279], static_cast<uint32>(61424));
 }
 
 }  // namespace libyuv
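For reference, the 1, 4, 6, 4, 1 kernel that both NEON loops compute can be sketched in plain C++. This is a minimal sketch based on the scalar GaussRow_NEON2 reference removed above and the expectations in planar_test.cc; the function names, the stdint types, and the standalone layout are illustrative and not part of the patch.

```cpp
#include <stdint.h>

// Column pass (cf. GaussCol_NEON): weight 5 input rows by 1, 4, 6, 4, 1 and
// accumulate into 32-bit sums, with no rounding or shift yet, matching the
// TestGaussCol_Opt expectation.
void GaussCol_Ref(const uint16_t* src0, const uint16_t* src1,
                  const uint16_t* src2, const uint16_t* src3,
                  const uint16_t* src4, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src0[i] * 1 + src1[i] * 4 + src2[i] * 6 + src3[i] * 4 +
             src4[i] * 1;
  }
}

// Row pass (cf. GaussRow_NEON): weight 5 consecutive 32-bit sums by
// 1, 4, 6, 4, 1, round by adding 128, and shift right by 8, mirroring the
// removed GaussRow_NEON2 reference.
void GaussRow_Ref(const uint32_t* src, uint16_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = static_cast<uint16_t>(
        (src[i] * 1 + src[i + 1] * 4 + src[i + 2] * 6 + src[i + 3] * 4 +
         src[i + 4] * 1 + 128) >> 8);
  }
}
```

Together the two passes form a separable 5x5 Gaussian: the outer product of [1 4 6 4 1] with itself sums to 256, which is why the row pass divides by 256 (shift right by 8) after both passes have been applied.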