Add an AArch64 dot-product variant of HammingDistance.

* include/libyuv/compare_row.h: define HAS_HAMMINGDISTANCE_NEON_DOTPROD for
  AArch64 builds with Neon enabled, declare HammingDistance_NEON_DotProd, and
  list HAS_HAMMINGDISTANCE_NEON before HAS_SUMSQUAREERROR_NEON.
* source/compare.cc: ComputeHammingDistance selects the new kernel when
  TestCpuFlag(kCpuHasNeonDotProd) is set; the check follows the plain Neon
  one, so it takes precedence over HammingDistance_NEON.
* source/compare_neon64.cc: the kernel consumes 32 bytes from each input per
  iteration: EOR the inputs, CNT the set bits of every byte, then UDOT
  against a vector of ones to accumulate the counts into two 4x32-bit
  accumulators that are summed after the loop.

diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h
index e78d742be..4ab4b49ad 100644
--- a/include/libyuv/compare_row.h
+++ b/include/libyuv/compare_row.h
@@ -78,12 +78,13 @@ extern "C" {
 // The following are available for Neon:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SUMSQUAREERROR_NEON
 #define HAS_HAMMINGDISTANCE_NEON
+#define HAS_SUMSQUAREERROR_NEON
 #endif
 
 // The following are available for AArch64 Neon:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_HAMMINGDISTANCE_NEON_DOTPROD
 #define HAS_SUMSQUAREERROR_NEON_DOTPROD
 #endif
 
@@ -107,6 +108,9 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
 uint32_t HammingDistance_NEON(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count);
+uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
+                                      const uint8_t* src_b,
+                                      int count);
 uint32_t HammingDistance_MSA(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count);
diff --git a/source/compare.cc b/source/compare.cc
index e128dfabc..3783baea2 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -134,6 +134,11 @@ uint64_t ComputeHammingDistance(const uint8_t* src_a,
     HammingDistance = HammingDistance_NEON;
   }
 #endif
+#if defined(HAS_HAMMINGDISTANCE_NEON_DOTPROD)
+  if (TestCpuFlag(kCpuHasNeonDotProd)) {
+    HammingDistance = HammingDistance_NEON_DotProd;
+  }
+#endif
 #if defined(HAS_HAMMINGDISTANCE_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     HammingDistance = HammingDistance_SSSE3;
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 0a4d52393..0b378c632 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -86,6 +86,38 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
   return sse;
 }
 
+uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
+                                      const uint8_t* src_b,
+                                      int count) {
+  uint32_t diff;
+  asm volatile(
+      "movi v4.4s, #0 \n"
+      "movi v5.4s, #0 \n"
+      "movi v6.16b, #1 \n"
+
+      "1: \n"
+      "ldp q0, q1, [%0], #32 \n"
+      "ldp q2, q3, [%1], #32 \n"
+      "eor v0.16b, v0.16b, v2.16b \n"
+      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
+      "eor v1.16b, v1.16b, v3.16b \n"
+      "cnt v0.16b, v0.16b \n"
+      "prfm pldl1keep, [%1, 448] \n"
+      "cnt v1.16b, v1.16b \n"
+      "subs %w2, %w2, #32 \n"
+      "udot v4.4s, v0.16b, v6.16b \n"
+      "udot v5.4s, v1.16b, v6.16b \n"
+      "b.gt 1b \n"
+
+      "add v0.4s, v4.4s, v5.4s \n"
+      "addv s0, v0.4s \n"
+      "fmov %w3, s0 \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+  return diff;
+}
+
 uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
                                      const uint8_t* src_b,
                                      int count) {