mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
HammingDistance_X86 using popcnt assembly
popcnt has a false dependency on its destination register. This assembly avoids the dependency by using a different register for each popcnt.

Bug: libyuv:701
Test: LIBYUV_DISABLE_SSSE3=1 out/Release/libyuv_unittest --gtest_filter=*Ham*Opt --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=9999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Change-Id: Ie1d202e2613b7fa8a3c02acd433940e92c80eafa
Reviewed-on: https://chromium-review.googlesource.com/731826
Reviewed-by: Cheng Wang <wangcheng@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
3e5bbea5bf
commit
80077a80c2
@ -60,7 +60,7 @@ extern "C" {
|
|||||||
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
|
(defined(__x86_64__) || defined(__i386__) || defined(_M_IX86))
|
||||||
#define HAS_HASHDJB2_SSE41
|
#define HAS_HASHDJB2_SSE41
|
||||||
#define HAS_SUMSQUAREERROR_SSE2
|
#define HAS_SUMSQUAREERROR_SSE2
|
||||||
#define HAS_HAMMINGDISTANCE_X86
|
#define HAS_HAMMINGDISTANCE_SSE42
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// The following are available for Visual C and clangcl 32 bit:
|
// The following are available for Visual C and clangcl 32 bit:
|
||||||
@ -98,7 +98,7 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
|
||||||
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_SSE42(const uint8* src_a, const uint8* src_b, int count);
|
||||||
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
|
||||||
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
|
||||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
|
||||||
|
|||||||
@ -130,16 +130,16 @@ uint64 ComputeHammingDistance(const uint8* src_a,
|
|||||||
HammingDistance = HammingDistance_NEON;
|
HammingDistance = HammingDistance_NEON;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_HAMMINGDISTANCE_X86)
|
|
||||||
if (TestCpuFlag(kCpuHasX86)) {
|
|
||||||
HammingDistance = HammingDistance_X86;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
|
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||||
HammingDistance = HammingDistance_SSSE3;
|
HammingDistance = HammingDistance_SSSE3;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_HAMMINGDISTANCE_SSE42)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE42)) {
|
||||||
|
HammingDistance = HammingDistance_SSE42;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_HAMMINGDISTANCE_AVX2)
|
#if defined(HAS_HAMMINGDISTANCE_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||||
HammingDistance = HammingDistance_AVX2;
|
HammingDistance = HammingDistance_AVX2;
|
||||||
|
|||||||
@ -22,18 +22,92 @@ extern "C" {
|
|||||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||||
|
|
||||||
// Left column of this hunk: the previous HammingDistance_X86, a plain C loop
// that xors 8 bytes of src_a and src_b at a time and sums
// __builtin_popcountll of the result. Its body lines are interleaved below
// with the right-column (new) inline-assembly HammingDistance_SSE42.
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
|
#if defined(__x86_64__)
|
||||||
|
// 64-bit variant: returns the number of bits that differ between src_a and
// src_b. Each loop pass handles 32 bytes as four 64-bit words; every word has
// its own scratch register and its own accumulator (r12-r15), and the four
// accumulators are summed after the loop.
// The loop body runs once before count is tested and count drops by 32 per
// pass, so a count that is not a positive multiple of 32 reads past the
// requested length. NOTE(review): presumably callers guarantee this - confirm.
// The 64-bit total is truncated to uint32 on return.
uint32 HammingDistance_SSE42(const uint8* src_a,
|
||||||
|
const uint8* src_b,
|
||||||
|
int count) {
|
||||||
|
uint64 diff = 0u;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
// Zero the four independent accumulators.
"xor %%r15,%%r15 \n"
|
||||||
|
"xor %%r14,%%r14 \n"
|
||||||
|
"xor %%r13,%%r13 \n"
|
||||||
|
"xor %%r12,%%r12 \n"
|
||||||
|
|
||||||
|
LABELALIGN
|
||||||
|
"1: \n"
|
||||||
|
// Words 0 and 1: load from src_a, xor with src_b, count the set bits.
"mov (%0),%%rax \n"
|
||||||
|
"mov 0x8(%0),%%rdx \n"
|
||||||
|
"xor (%1),%%rax \n"
|
||||||
|
"xor 0x8(%1),%%rdx \n"
|
||||||
|
"popcnt %%rax,%%rax \n"
|
||||||
|
"popcnt %%rdx,%%rdx \n"
|
||||||
|
// Words 2 and 3: same sequence in a second pair of registers.
"mov 0x10(%0),%%rcx \n"
|
||||||
|
"mov 0x18(%0),%%rsi \n"
|
||||||
|
"xor 0x10(%1),%%rcx \n"
|
||||||
|
"xor 0x18(%1),%%rsi \n"
|
||||||
|
"popcnt %%rcx,%%rcx \n"
|
||||||
|
"popcnt %%rsi,%%rsi \n"
|
||||||
|
// Advance both source pointers by the 32 bytes just consumed.
"add $0x20,%0 \n"
|
||||||
|
"add $0x20,%1 \n"
|
||||||
|
// Fold each word's bit count into its own accumulator.
"add %%rax,%%r15 \n"
|
||||||
|
"add %%rdx,%%r14 \n"
|
||||||
|
"add %%rcx,%%r13 \n"
|
||||||
|
"add %%rsi,%%r12 \n"
|
||||||
|
// Consume 32 bytes of count; repeat while the remainder is positive.
"sub $0x20,%2 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
|
||||||
|
// Reduce the four accumulators into r12 and copy the total to diff.
"add %%r15, %%r14 \n"
|
||||||
|
"add %%r13, %%r12 \n"
|
||||||
|
"add %%r14, %%r12 \n"
|
||||||
|
"mov %%r12, %3 \n"
|
||||||
|
: "+r"(src_a), // %0
|
||||||
|
"+r"(src_b), // %1
|
||||||
|
"+r"(count), // %2
|
||||||
|
"=r"(diff) // %3
|
||||||
|
:
|
||||||
|
// Every scratch and accumulator register used above is declared clobbered.
: "memory", "cc", "rax", "rdx", "rcx", "rsi", "r12", "r13", "r14", "r15");
|
||||||
|
|
||||||
|
return static_cast<uint32>(diff);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// 32-bit variant: same contract as the 64-bit version, but each loop pass
// handles 16 bytes as four 32-bit words using only eax/edx, and every popcnt
// result is added straight into diff (operand %3) instead of into separate
// accumulators. The loop body runs before count is tested and count drops by
// 16 per pass. NOTE(review): presumably callers pass a positive multiple of
// 16 - confirm.
uint32 HammingDistance_SSE42(const uint8* src_a,
|
||||||
|
const uint8* src_b,
|
||||||
|
int count) {
|
||||||
uint32 diff = 0u;
|
uint32 diff = 0u;
|
||||||
|
|
||||||
int i;
|
asm volatile(LABELALIGN
|
||||||
for (i = 0; i < count - 7; i += 8) {
|
"1: \n"
|
||||||
uint64 x = *((uint64*)src_a) ^ *((uint64*)src_b);
|
// Bytes 0-7: two 32-bit words, xor'd with src_b and bit-counted.
"mov (%0),%%eax \n"
|
||||||
src_a += 8;
|
"mov 0x4(%0),%%edx \n"
|
||||||
src_b += 8;
|
"xor (%1),%%eax \n"
|
||||||
diff += __builtin_popcountll(x);
|
"xor 0x4(%1),%%edx \n"
|
||||||
}
|
"popcnt %%eax,%%eax \n"
|
||||||
|
"add %%eax,%3 \n"
|
||||||
|
"popcnt %%edx,%%edx \n"
|
||||||
|
"add %%edx,%3 \n"
|
||||||
|
// Bytes 8-15: the same sequence on the next two words.
"mov 0x8(%0),%%eax \n"
|
||||||
|
"mov 0xc(%0),%%edx \n"
|
||||||
|
"xor 0x8(%1),%%eax \n"
|
||||||
|
"xor 0xc(%1),%%edx \n"
|
||||||
|
"popcnt %%eax,%%eax \n"
|
||||||
|
"add %%eax,%3 \n"
|
||||||
|
"popcnt %%edx,%%edx \n"
|
||||||
|
"add %%edx,%3 \n"
|
||||||
|
// Advance both pointers by 16 bytes, consume 16 bytes of count, and loop
// while the remainder is positive.
"add $0x10,%0 \n"
|
||||||
|
"add $0x10,%1 \n"
|
||||||
|
"sub $0x10,%2 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
: "+r"(src_a), // %0
|
||||||
|
"+r"(src_b), // %1
|
||||||
|
"+r"(count), // %2
|
||||||
|
"+r"(diff) // %3
|
||||||
|
:
|
||||||
|
: "memory", "cc", "eax", "edx");
|
||||||
|
|
||||||
return diff;
|
return diff;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
|
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
|
||||||
15, 15, 15, 15, 15, 15, 15, 15};
|
15, 15, 15, 15, 15, 15, 15, 15};
|
||||||
|
|||||||
@ -25,7 +25,9 @@ extern "C" {
|
|||||||
// This module is for 32 bit Visual C x86 and clangcl
|
// This module is for 32 bit Visual C x86 and clangcl
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
|
||||||
|
|
||||||
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
|
uint32 HammingDistance_SSE42(const uint8* src_a,
|
||||||
|
const uint8* src_b,
|
||||||
|
int count) {
|
||||||
uint32 diff = 0u;
|
uint32 diff = 0u;
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
|
|||||||
@ -234,18 +234,29 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
|
|||||||
if (has_avx2) {
|
if (has_avx2) {
|
||||||
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
|
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
|
||||||
} else {
|
} else {
|
||||||
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
|
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
|
||||||
if (has_ssse3) {
|
if (has_sse42) {
|
||||||
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
|
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
|
||||||
} else {
|
} else {
|
||||||
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
|
||||||
|
if (has_ssse3) {
|
||||||
|
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
|
||||||
|
} else {
|
||||||
|
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#elif defined(HAS_HAMMINGDISTANCE_X86)
|
#elif defined(HAS_HAMMINGDISTANCE_SSE42)
|
||||||
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
|
||||||
|
if (has_sse42) {
|
||||||
|
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
|
||||||
|
} else {
|
||||||
|
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
EXPECT_EQ(h0, h1);
|
EXPECT_EQ(h0, h1);
|
||||||
|
|
||||||
@ -328,59 +339,63 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848
|
|||||||
|
|
||||||
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
|
TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
|
||||||
uint32 h1 = 0;
|
uint32 h1 = 0;
|
||||||
align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_);
|
const int kMaxWidth =benchmark_width_ * benchmark_height_;
|
||||||
align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_);
|
align_buffer_page_end(src_a, kMaxWidth);
|
||||||
memset(src_a, 255u, benchmark_width_ * benchmark_height_);
|
align_buffer_page_end(src_b, kMaxWidth);
|
||||||
memset(src_b, 0, benchmark_width_ * benchmark_height_);
|
memset(src_a, 255u, kMaxWidth);
|
||||||
|
memset(src_b, 0u, kMaxWidth);
|
||||||
|
|
||||||
uint64 h0 = ComputeHammingDistance(src_a, src_b,
|
uint64 h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth);
|
||||||
benchmark_width_ * benchmark_height_);
|
EXPECT_EQ(kMaxWidth * 8ULL, h0);
|
||||||
EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h0);
|
|
||||||
|
|
||||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||||
#if defined(HAS_HAMMINGDISTANCE_NEON)
|
#if defined(HAS_HAMMINGDISTANCE_NEON)
|
||||||
h1 = HammingDistance_NEON(src_a, src_b,
|
h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
|
||||||
benchmark_width_ * benchmark_height_);
|
|
||||||
#elif defined(HAS_HAMMINGDISTANCE_AVX2)
|
#elif defined(HAS_HAMMINGDISTANCE_AVX2)
|
||||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||||
if (has_avx2) {
|
if (has_avx2) {
|
||||||
h1 = HammingDistance_AVX2(src_a, src_b,
|
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
|
||||||
benchmark_width_ * benchmark_height_);
|
|
||||||
} else {
|
} else {
|
||||||
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
|
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
|
||||||
if (has_ssse3) {
|
if (has_sse42) {
|
||||||
h1 = HammingDistance_SSSE3(src_a, src_b,
|
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
|
||||||
benchmark_width_ * benchmark_height_);
|
|
||||||
} else {
|
} else {
|
||||||
h1 = HammingDistance_X86(src_a, src_b,
|
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
|
||||||
benchmark_width_ * benchmark_height_);
|
if (has_ssse3) {
|
||||||
|
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
|
||||||
|
} else {
|
||||||
|
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#elif defined(HAS_HAMMINGDISTANCE_X86)
|
#elif defined(HAS_HAMMINGDISTANCE_SSE42)
|
||||||
h1 =
|
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
|
||||||
HammingDistance_X86(src_a, src_b, benchmark_width_ * benchmark_height_);
|
if (has_sse42) {
|
||||||
|
h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth);
|
||||||
|
} else {
|
||||||
|
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
h1 = HammingDistance_C(src_a, src_b, benchmark_width_ * benchmark_height_);
|
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
// A large count will cause the low level to potentially overflow so the
|
// A large count will cause the low level to potentially overflow so the
|
||||||
// result can not be expected to be correct.
|
// result can not be expected to be correct.
|
||||||
// TODO(fbarchard): Consider expecting the low 16 bits to match.
|
// TODO(fbarchard): Consider expecting the low 16 bits to match.
|
||||||
if ((benchmark_width_ * benchmark_height_) <= kMaxOptCount) {
|
if (kMaxWidth<= kMaxOptCount) {
|
||||||
EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8U, h1);
|
EXPECT_EQ(kMaxWidth * 8U, h1);
|
||||||
} else {
|
} else {
|
||||||
if (benchmark_width_ * benchmark_height_ * 8ULL !=
|
if (kMaxWidth * 8ULL != static_cast<uint64>(h1)) {
|
||||||
static_cast<uint64>(h1)) {
|
|
||||||
printf(
|
printf(
|
||||||
"warning - HammingDistance_Opt %u does not match %llu "
|
"warning - HammingDistance_Opt %u does not match %llu "
|
||||||
"but length of %u is longer than guaranteed.\n",
|
"but length of %u is longer than guaranteed.\n",
|
||||||
h1, benchmark_width_ * benchmark_height_ * 8ULL,
|
h1, kMaxWidth * 8ULL, kMaxWidth);
|
||||||
benchmark_width_ * benchmark_height_);
|
|
||||||
} else {
|
} else {
|
||||||
printf(
|
printf(
|
||||||
"warning - HammingDistance_Opt %u matches but length of %u "
|
"warning - HammingDistance_Opt %u matches but length of %u "
|
||||||
"is longer than guaranteed.\n",
|
"is longer than guaranteed.\n",
|
||||||
h1, benchmark_width_ * benchmark_height_);
|
h1, kMaxWidth);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user