mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Port HammingDistance to SSSE3
Bug: libyuv:701
Test: BenchmarkHammingDistance_Opt
Change-Id: Ibdd5d382677ebef4f82a62e0d5c3b88614a3b6e4
Reviewed-on: https://chromium-review.googlesource.com/696290
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
bde789b176
commit
fecd741794
6
BUILD.gn
6
BUILD.gn
@ -160,9 +160,9 @@ static_library("libyuv_internal") {
|
||||
# To enable AVX2 or other cpu optimization, pass flag here
|
||||
if (!is_win) {
|
||||
cflags = [
|
||||
# "-mpopcnt",
|
||||
# "-mavx2",
|
||||
# "-mfma",
|
||||
# "-mpopcnt",
|
||||
# "-mavx2",
|
||||
# "-mfma",
|
||||
"-ffp-contract=fast", # Enable fma vectorization for NEON.
|
||||
]
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1671
|
||||
Version: 1672
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -70,7 +70,13 @@ extern "C" {
|
||||
#define HAS_SUMSQUAREERROR_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available for VGCC and clangcl 64 bit:
|
||||
// The following are available for GCC and clangcl 64 bit:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
#define HAS_HAMMINGDISTANCE_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available for GCC and clangcl 64 bit:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
#define HAS_HAMMINGDISTANCE_AVX2
|
||||
@ -93,7 +99,7 @@ extern "C" {
|
||||
|
||||
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_SSE2(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_SSSE3(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
|
||||
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1671
|
||||
#define LIBYUV_VERSION 1672
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -131,6 +131,11 @@ uint64 ComputeHammingDistance(const uint8* src_a,
|
||||
HammingDistance = HammingDistance_X86;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
HammingDistance = HammingDistance_SSSE3;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HAMMINGDISTANCE_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HammingDistance = HammingDistance_AVX2;
|
||||
|
||||
@ -35,15 +35,74 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
|
||||
return diff;
|
||||
}
|
||||
|
||||
#ifdef HAS_HAMMINGDISTANCE_AVX2
|
||||
static uint32 kNibbleMask = 0x0f0f0f0fu;
|
||||
// Byte mask 0x0f in each of the 16 lanes: ANDing with it keeps only the low
// nibble (4 bits) of every byte.
static vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
|
||||
15, 15, 15, 15, 15, 15, 15, 15};
|
||||
// 16-entry lookup table: entry i holds the number of set bits in the 4-bit
// value i (0 -> 0, 1 -> 1, ..., 15 -> 4).
static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
|
||||
|
||||
uint32 HammingDistance_SSSE3(const uint8* src_a,
|
||||
const uint8* src_b,
|
||||
int count) {
|
||||
uint32 diff = 0u;
|
||||
|
||||
asm volatile(
|
||||
"movdqa %4,%%xmm2 \n"
|
||||
"movdqa %5,%%xmm3 \n"
|
||||
"pxor %%xmm0,%%xmm0 \n"
|
||||
"pxor %%xmm1,%%xmm1 \n"
|
||||
"sub %0,%1 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm4 \n"
|
||||
"movdqa 0x10(%0), %%xmm5 \n"
|
||||
"pxor (%0,%1), %%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"pand %%xmm2,%%xmm6 \n"
|
||||
"psrlw $0x4,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"pshufb %%xmm6,%%xmm7 \n"
|
||||
"pand %%xmm2,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm6 \n"
|
||||
"pshufb %%xmm4,%%xmm6 \n"
|
||||
"paddb %%xmm7,%%xmm6 \n"
|
||||
"pxor 0x10(%0,%1),%%xmm5 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"movdqa %%xmm5,%%xmm4 \n"
|
||||
"pand %%xmm2,%%xmm5 \n"
|
||||
"psrlw $0x4,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"pshufb %%xmm5,%%xmm7 \n"
|
||||
"pand %%xmm2,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm5 \n"
|
||||
"pshufb %%xmm4,%%xmm5 \n"
|
||||
"paddb %%xmm7,%%xmm5 \n"
|
||||
"paddb %%xmm5,%%xmm6 \n"
|
||||
"psadbw %%xmm1,%%xmm6 \n"
|
||||
"paddd %%xmm6,%%xmm0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"pshufd $0xaa,%%xmm0,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"vmovd %%xmm0, %3 \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=r"(diff) // %3
|
||||
: "m"(kNibbleMask), // %4
|
||||
"m"(kBitCount) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
#ifdef HAS_HAMMINGDISTANCE_AVX2
|
||||
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
uint32 diff = 0u;
|
||||
|
||||
asm volatile(
|
||||
"vbroadcastss %4,%%ymm2 \n"
|
||||
"vbroadcastf128 %4,%%ymm2 \n"
|
||||
"vbroadcastf128 %5,%%ymm3 \n"
|
||||
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
|
||||
@ -83,7 +142,7 @@ uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
"=g"(diff) // %3
|
||||
"=r"(diff) // %3
|
||||
: "m"(kNibbleMask), // %4
|
||||
"m"(kBitCount) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
|
||||
@ -636,10 +636,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
||||
"subs %w2, %w2, #32 \n" // 32 processed per loop
|
||||
"stp q0, q1, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(count) // %2 // Output registers
|
||||
: // Input registers
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(count) // %2 // Output registers
|
||||
: // Input registers
|
||||
: "cc", "memory", "v0", "v1" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
@ -234,7 +234,12 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
|
||||
if (has_avx2) {
|
||||
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
|
||||
} else {
|
||||
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
||||
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
|
||||
if (has_ssse3) {
|
||||
h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth);
|
||||
} else {
|
||||
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
||||
}
|
||||
}
|
||||
#elif defined(HAS_HAMMINGDISTANCE_X86)
|
||||
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user