mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
Hamming Distance SSE2 and AVX2 optimized
Bug: None Test: None Change-Id: Id52663f9c957aac3172fba92d888ad1b041d5cf0 Reviewed-on: https://chromium-review.googlesource.com/692981 Reviewed-by: Cheng Wang <wangcheng@google.com> Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
311add63c2
commit
bde789b176
8
BUILD.gn
8
BUILD.gn
@ -158,9 +158,13 @@ static_library("libyuv_internal") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# To enable AVX2 or other cpu optimization, pass flag here
|
# To enable AVX2 or other cpu optimization, pass flag here
|
||||||
# cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
|
|
||||||
if (!is_win) {
|
if (!is_win) {
|
||||||
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON.
|
cflags = [
|
||||||
|
# "-mpopcnt",
|
||||||
|
# "-mavx2",
|
||||||
|
# "-mfma",
|
||||||
|
"-ffp-contract=fast", # Enable fma vectorization for NEON.
|
||||||
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (libyuv_use_neon) {
|
if (libyuv_use_neon) {
|
||||||
|
|||||||
@ -49,6 +49,7 @@ extern "C" {
|
|||||||
// #define DISABLE_CLANG_MSA 1
|
// #define DISABLE_CLANG_MSA 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// The following are available for Visual C:
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
|
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
|
||||||
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
|
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
|
||||||
#define HAS_HASHDJB2_AVX2
|
#define HAS_HASHDJB2_AVX2
|
||||||
@ -69,6 +70,12 @@ extern "C" {
|
|||||||
#define HAS_SUMSQUAREERROR_AVX2
|
#define HAS_SUMSQUAREERROR_AVX2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// The following are available for GCC and clangcl 64 bit:
|
||||||
|
#if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \
|
||||||
|
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||||
|
#define HAS_HAMMINGDISTANCE_AVX2
|
||||||
|
#endif
|
||||||
|
|
||||||
// The following are available for Neon:
|
// The following are available for Neon:
|
||||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||||
@ -86,6 +93,8 @@ extern "C" {
|
|||||||
|
|
||||||
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count);
|
||||||
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count);
|
||||||
|
uint32 HammingDistance_SSE2(const uint8* src_a, const uint8* src_b, int count);
|
||||||
|
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count);
|
||||||
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count);
|
||||||
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
|
uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count);
|
||||||
|
|
||||||
|
|||||||
@ -115,7 +115,8 @@ uint64 ComputeHammingDistance(const uint8* src_a,
|
|||||||
const uint8* src_b,
|
const uint8* src_b,
|
||||||
int count) {
|
int count) {
|
||||||
const int kBlockSize = 65536;
|
const int kBlockSize = 65536;
|
||||||
int remainder = count & (kBlockSize - 1) & ~31;
|
// SIMD for multiple of 64, and C for remainder
|
||||||
|
int remainder = count & (kBlockSize - 1) & ~63;
|
||||||
uint64 diff = 0;
|
uint64 diff = 0;
|
||||||
int i;
|
int i;
|
||||||
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
|
uint32 (*HammingDistance)(const uint8* src_a, const uint8* src_b, int count) =
|
||||||
|
|||||||
@ -35,6 +35,63 @@ uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count) {
|
|||||||
return diff;
|
return diff;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef HAS_HAMMINGDISTANCE_AVX2
|
||||||
|
static uint32 kNibbleMask = 0x0f0f0f0fu;
|
||||||
|
static vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
|
||||||
|
|
||||||
|
// Counts the bits that differ between src_a and src_b using AVX2.
//
// Each loop iteration XORs 64 bytes of src_a with 64 bytes of src_b, splits
// every result byte into its low and high nibble, looks up each nibble's bit
// count in the 16-entry kBitCount table with vpshufb, and accumulates the
// per-byte counts into ymm0.
//
// src_a  first buffer; it is read with vmovdqa, which requires the address
//        to be 32-byte aligned.
// src_b  second buffer; it is only used as a vpxor memory operand.
// count  number of bytes to compare. The loop is bottom-tested and consumes
//        64 bytes per pass, so at least 64 bytes are always read and count
//        is expected to be a positive multiple of 64.
//
// Returns the low 32 bits of the accumulated bit count.
uint32 HammingDistance_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||||
|
uint32 diff = 0u;
|
||||||
|
|
||||||
|
asm volatile(
|
||||||
|
// ymm2 = kNibbleMask (0x0f in every byte).
"vbroadcastss %4,%%ymm2 \n"
|
||||||
|
// ymm3 = kBitCount nibble table replicated into both 128-bit lanes.
"vbroadcastf128 %5,%%ymm3 \n"
|
||||||
|
// ymm0 = running total, ymm1 = zero operand for vpsadbw.
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
|
||||||
|
// Turn src_b into an offset from src_a so (%0,%1) addresses src_b while
// only %0 has to be advanced.
"sub %0,%1 \n"
|
||||||
|
|
||||||
|
LABELALIGN
|
||||||
|
"1: \n"
|
||||||
|
// Load 64 bytes of src_a (aligned loads).
"vmovdqa (%0),%%ymm4 \n"
|
||||||
|
"vmovdqa 0x20(%0), %%ymm5 \n"
|
||||||
|
// First 32 bytes: XOR with src_b, then count bits per nibble.
"vpxor (%0,%1), %%ymm4, %%ymm4 \n"
|
||||||
|
"vpand %%ymm2,%%ymm4,%%ymm6 \n"
|
||||||
|
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
|
||||||
|
"vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
|
||||||
|
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
|
||||||
|
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
|
||||||
|
"vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
|
||||||
|
// Second 32 bytes: same sequence, reusing ymm4/ymm5 as scratch.
"vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
|
||||||
|
"add $0x40,%0 \n"
|
||||||
|
"vpand %%ymm2,%%ymm4,%%ymm5 \n"
|
||||||
|
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
|
||||||
|
"vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
|
||||||
|
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
|
||||||
|
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
|
||||||
|
"vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
|
||||||
|
// Combine both halves, sum each group of 8 byte counts against zero, and
// add the partial sums to the running total.
"vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
|
||||||
|
"vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
|
||||||
|
"vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
|
||||||
|
"sub $0x40,%2 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
|
||||||
|
// Horizontal reduction: fold the four 64-bit partial sums of ymm0 into
// element 0, then move its low 32 bits to diff.
"vpermq $0xb1,%%ymm0,%%ymm1 \n"
|
||||||
|
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
|
||||||
|
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
|
"vmovd %%xmm0, %3 \n"
|
||||||
|
"vzeroupper \n"
|
||||||
|
: "+r"(src_a), // %0
|
||||||
|
"+r"(src_b), // %1
|
||||||
|
"+r"(count), // %2
|
||||||
|
"=g"(diff) // %3
|
||||||
|
: "m"(kNibbleMask), // %4
|
||||||
|
"m"(kBitCount) // %5
|
||||||
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||||
|
|
||||||
|
return diff;
|
||||||
|
}
|
||||||
|
#endif // HAS_HAMMINGDISTANCE_AVX2
|
||||||
|
|
||||||
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||||
uint32 sse;
|
uint32 sse;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
|
|||||||
@ -229,13 +229,19 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) {
|
|||||||
for (int i = 0; i < count; ++i) {
|
for (int i = 0; i < count; ++i) {
|
||||||
#if defined(HAS_HAMMINGDISTANCE_NEON)
|
#if defined(HAS_HAMMINGDISTANCE_NEON)
|
||||||
h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
|
h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth);
|
||||||
|
#elif defined(HAS_HAMMINGDISTANCE_AVX2)
|
||||||
|
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||||
|
if (has_avx2) {
|
||||||
|
h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth);
|
||||||
|
} else {
|
||||||
|
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
||||||
|
}
|
||||||
#elif defined(HAS_HAMMINGDISTANCE_X86)
|
#elif defined(HAS_HAMMINGDISTANCE_X86)
|
||||||
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
h1 = HammingDistance_X86(src_a, src_b, kMaxWidth);
|
||||||
#else
|
#else
|
||||||
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
h1 = HammingDistance_C(src_a, src_b, kMaxWidth);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
EXPECT_EQ(h0, h1);
|
EXPECT_EQ(h0, h1);
|
||||||
|
|
||||||
free_aligned_buffer_page_end(src_a);
|
free_aligned_buffer_page_end(src_a);
|
||||||
|
|||||||
@ -2878,13 +2878,13 @@ float TestCopySamples(int benchmark_width,
|
|||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, TestCopySamples_C) {
|
TEST_F(LibYUVPlanarTest, TestCopySamples_C) {
|
||||||
float diff = TestCopySamples(benchmark_width_, benchmark_height_,
|
float diff = TestCopySamples(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, false);
|
benchmark_iterations_, false);
|
||||||
EXPECT_EQ(0, diff);
|
EXPECT_EQ(0, diff);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) {
|
TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) {
|
||||||
float diff = TestCopySamples(benchmark_width_, benchmark_height_,
|
float diff = TestCopySamples(benchmark_width_, benchmark_height_,
|
||||||
benchmark_iterations_, true);
|
benchmark_iterations_, true);
|
||||||
EXPECT_EQ(0, diff);
|
EXPECT_EQ(0, diff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user