mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
DJB2 hash with SSE4 pmulld
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/484002 git-svn-id: http://libyuv.googlecode.com/svn/trunk@237 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
c5d44a0c5e
commit
ddf9051bd9
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 236
|
Version: 237
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 236
|
#define LIBYUV_VERSION 237
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
|
|||||||
@ -25,6 +25,7 @@ namespace libyuv {
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// hash seed of 5381 recommended.
|
||||||
// Internal C version of HashDjb2 with int sized count for efficiency.
|
// Internal C version of HashDjb2 with int sized count for efficiency.
|
||||||
static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
|
static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
|
||||||
uint32 hash = seed;
|
uint32 hash = seed;
|
||||||
@ -34,17 +35,250 @@ static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
|
|||||||
return hash;
|
return hash;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This module is for Visual C x86
|
||||||
|
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
|
||||||
|
|
||||||
|
#define HAS_HASHDJB2_SSE41
|
||||||
|
// Multiplier tables for the 8-bytes-per-pass loop in HashDjb2_SSE41.
// kMulL33 is applied to src[0-3] and kMulH33 to src[4-7]; together with the
// running hash being scaled by 33^8, one pass equals eight steps of
// hash = hash * 33 + byte.
// NOTE(review): vec32 is declared elsewhere; presumably four 32-bit lanes with
// 16-byte alignment (these tables are loaded with movdqa) - confirm.
static const vec32 kMulL33 = {
|
||||||
|
0xEC41D4E1, // 33 * 33 * 33 * 33 * 33 * 33 * 33 (33^7, low 32 bits)
|
||||||
|
33 * 33 * 33 * 33 * 33 * 33,
|
||||||
|
33 * 33 * 33 * 33 * 33,
|
||||||
|
33 * 33 * 33 * 33 * 1 };
|
||||||
|
static const vec32 kMulH33 = {
|
||||||
|
33 * 33 * 33,
|
||||||
|
33 * 33,
|
||||||
|
33,
|
||||||
|
1 };
|
||||||
|
// 33^4 in the first element; not referenced by the code visible in this change.
static const vec32 kHash4x33 = { 33 * 33 * 33 * 33, 0, 0, 0 };
|
||||||
|
// 33^8 in the first element: scales the running hash once per 8 input bytes.
static const vec32 kHash8x33 = {
|
||||||
|
0x747C7101, // 33 * 33 * 33 * 33 * 33 * 33 * 33 * 33, (33^8, low 32 bits)
|
||||||
|
0, 0, 0 };
|
||||||
|
|
||||||
|
|
||||||
|
// hash0 = initial state
|
||||||
|
// hash1 = hash0 * 33 + src[0]
|
||||||
|
// hash2 = hash1 * 33 + src[1] = (hash0 * 33 + src[0]) * 33 + src[1]
|
||||||
|
// hash3 = hash2 * 33 + src[2] = (hash1 * 33 + src[1]) * 33 + src[2] =
|
||||||
|
// ((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]
|
||||||
|
// hash4 = hash3 * 33 + src[3] = (hash2 * 33 + src[2]) * 33 + src[3] =
|
||||||
|
// ((hash1 * 33 + src[1]) * 33 + src[2]) * 33 + src[3] =
|
||||||
|
// (((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]) * 33 + src[3]
|
||||||
|
|
||||||
|
// pmovzxbd xmm1, [eax] // SSE4.1 requires VS2010
|
||||||
|
// pmulld requires Studio2008
|
||||||
|
// does 8 at a time, unaligned
|
||||||
|
// DJB2 hash of `count` bytes at `src`, continuing from `seed`, using SSE4.1
// pmulld (Visual C x86 inline assembly). Returns the updated hash in eax.
// Each pass consumes 8 bytes:
//   hash = hash * 33^8 + src[0] * 33^7 + src[1] * 33^6 + ... + src[7] * 1
// The loop body runs before count is tested and steps by 8, so count is
// expected to be a positive multiple of 8. src is read with movq, which has no
// alignment requirement.
// Naked function: there is no prologue, so arguments are read off esp directly.
__declspec(naked) __declspec(align(16))
|
||||||
|
static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 4] // src
|
||||||
|
mov ecx, [esp + 8] // count
|
||||||
|
movd xmm0, [esp + 12] // seed
|
||||||
|
pxor xmm7, xmm7 // constant 0 for unpck
|
||||||
|
movdqa xmm4, kHash8x33 // 33^8 multiplier for the running hash
|
||||||
|
movdqa xmm5, kMulL33 // multipliers for src[0-3]
|
||||||
|
movdqa xmm6, kMulH33 // multipliers for src[4-7]
|
||||||
|
|
||||||
|
align 16
|
||||||
|
wloop:
|
||||||
|
movq xmm1, qword ptr [eax] // src[0-7]
|
||||||
|
lea eax, [eax + 8]
|
||||||
|
punpcklbw xmm1, xmm7 // 8 bytes -> 8 words
|
||||||
|
movdqa xmm3, xmm1
|
||||||
|
punpcklwd xmm1, xmm7 // src[0-3] as dwords
|
||||||
|
// pmulld xmm1, xmm5
|
||||||
|
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xCD
|
||||||
|
punpckhwd xmm3, xmm7 // src[4-7] as dwords
|
||||||
|
// pmulld xmm3, xmm6
|
||||||
|
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xDE
|
||||||
|
sub ecx, 8 // sets the flags tested by jg below; the SSE ops in between leave them alone
|
||||||
|
// pmulld xmm0, xmm4 // hash *= 33 ^ 8
|
||||||
|
_asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xC4
|
||||||
|
paddd xmm1, xmm3 // add 2nd 4 to first 4
|
||||||
|
pshufd xmm2, xmm1, 14 // upper 2 dwords
|
||||||
|
paddd xmm1, xmm2
|
||||||
|
pshufd xmm2, xmm1, 1 // dword 1 down to dword 0
|
||||||
|
paddd xmm1, xmm2 // dword 0 now holds the sum of all four lanes
|
||||||
|
paddd xmm0, xmm1 // only dword 0 is meaningful; it is the value returned
|
||||||
|
jg wloop
|
||||||
|
|
||||||
|
movd eax, xmm0 // return hash
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HAS_HASHDJB2_ALIGNED_SSE41
|
||||||
|
// Multiplier tables for the 16-bytes-per-pass hash loops.
// kHash16x33 scales the running hash once per 16 input bytes; kHashMul0..3
// hold the per-byte weights for src[0-3], src[4-7], src[8-11] and src[12-15].
// Values are powers of 33 truncated to 32 bits and written as signed decimals.
static const vec32 kHash16x33 = { -1831214591, 0, 0, 0 }; // 33 ^ 16
|
||||||
|
static const vec32 kHashMul0 = {
|
||||||
|
204809697, // 33 ^ 15
|
||||||
|
-1555599935, // 33 ^ 14
|
||||||
|
994064801, // 33 ^ 13
|
||||||
|
1331628417, // 33 ^ 12
|
||||||
|
};
|
||||||
|
static const vec32 kHashMul1 = {
|
||||||
|
821255521, // 33 ^ 11
|
||||||
|
-2057521855, // 33 ^ 10
|
||||||
|
67801377, // 33 ^ 9
|
||||||
|
1954312449, // 33 ^ 8
|
||||||
|
};
|
||||||
|
static const vec32 kHashMul2 = {
|
||||||
|
-331229983, // 33 ^ 7
|
||||||
|
1291467969, // 33 ^ 6
|
||||||
|
39135393, // 33 ^ 5
|
||||||
|
1185921, // 33 ^ 4
|
||||||
|
};
|
||||||
|
static const vec32 kHashMul3 = {
|
||||||
|
35937, // 33 ^ 3
|
||||||
|
1089, // 33 ^ 2
|
||||||
|
33, // 33 ^ 1
|
||||||
|
1, // 33 ^ 0
|
||||||
|
};
|
||||||
|
|
||||||
|
// pmovzxbd xmm1, [eax] // SSE4.1 requires VS2010
|
||||||
|
// pmulld requires Studio2008
|
||||||
|
// does 16 at a time, aligned
|
||||||
|
// TODO(fbarchard): For SSE2 version use pmuludq
|
||||||
|
// pmulld xmm1, xmm5
|
||||||
|
// becomes
|
||||||
|
// movdqa xmm2, xmm1
|
||||||
|
// pmuludq xmm1, [33*33*33, 0, 33, 0]
|
||||||
|
// psrldq xmm2, 8
|
||||||
|
// pmuludq xmm2, [33*33, 0, 1, 0]
|
||||||
|
// paddd xmm1, xmm2
|
||||||
|
// pshufd xmm2, xmm1, 2
|
||||||
|
// paddd xmm1, xmm2
|
||||||
|
|
||||||
|
|
||||||
|
//27: 66 0F 38 40 C6 pmulld xmm0,xmm6
|
||||||
|
//44: 66 0F 38 40 DD pmulld xmm3,xmm5
|
||||||
|
//59: 66 0F 38 40 E5 pmulld xmm4,xmm5
|
||||||
|
//72: 66 0F 38 40 D5 pmulld xmm2,xmm5
|
||||||
|
//83: 66 0F 38 40 CD pmulld xmm1,xmm5
|
||||||
|
// Emits the raw bytes 66 0F 38 40 <reg> for a register-to-register pmulld.
// `reg` is the final byte of the encoding, which selects the two xmm operands
// (e.g. 0xC6 = pmulld xmm0, xmm6, per the listing above).
#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
|
||||||
|
_asm _emit 0x40 _asm _emit reg
|
||||||
|
|
||||||
|
// DJB2 hash of `count` bytes at `src`, continuing from `seed`, using SSE4.1
// pmulld (Visual C x86 inline assembly). Returns the updated hash in eax.
// Each pass consumes 16 bytes:
//   hash = hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15] * 1
// The loop body runs before count is tested and steps by 16, so count is
// expected to be a positive multiple of 16.
// src is loaded with movdqa, which requires a 16-byte aligned address.
// NOTE(review): the HashDjb2 dispatcher shown in this change picks this routine
// from the alignment of count only - confirm src is 16-byte aligned as well.
// Naked function: there is no prologue, so arguments are read off esp directly.
__declspec(naked) __declspec(align(16))
|
||||||
|
static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 4] // src
|
||||||
|
mov ecx, [esp + 8] // count
|
||||||
|
movd xmm0, [esp + 12] // seed
|
||||||
|
|
||||||
|
pxor xmm7, xmm7 // constant 0 for unpck
|
||||||
|
movdqa xmm6, kHash16x33 // 33^16 multiplier for the running hash
|
||||||
|
|
||||||
|
align 16
|
||||||
|
wloop:
|
||||||
|
movdqa xmm1, [eax] // src[0-15]
|
||||||
|
lea eax, [eax + 16]
|
||||||
|
pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
|
||||||
|
movdqa xmm5, kHashMul0 // xmm5 is reloaded with each weight table in turn
|
||||||
|
movdqa xmm2, xmm1
|
||||||
|
punpcklbw xmm2, xmm7 // src[0-7]
|
||||||
|
movdqa xmm3, xmm2
|
||||||
|
punpcklwd xmm3, xmm7 // src[0-3]
|
||||||
|
pmulld(0xdd) // pmulld xmm3, xmm5
|
||||||
|
movdqa xmm5, kHashMul1
|
||||||
|
movdqa xmm4, xmm2
|
||||||
|
punpckhwd xmm4, xmm7 // src[4-7]
|
||||||
|
pmulld(0xe5) // pmulld xmm4, xmm5
|
||||||
|
movdqa xmm5, kHashMul2
|
||||||
|
punpckhbw xmm1, xmm7 // src[8-15]
|
||||||
|
movdqa xmm2, xmm1
|
||||||
|
punpcklwd xmm2, xmm7 // src[8-11]
|
||||||
|
pmulld(0xd5) // pmulld xmm2, xmm5
|
||||||
|
movdqa xmm5, kHashMul3
|
||||||
|
punpckhwd xmm1, xmm7 // src[12-15]
|
||||||
|
pmulld(0xcd) // pmulld xmm1, xmm5
|
||||||
|
paddd xmm3, xmm4 // add 16 results
|
||||||
|
paddd xmm1, xmm2
|
||||||
|
sub ecx, 16 // sets the flags tested by jg below; the SSE ops in between leave them alone
|
||||||
|
paddd xmm1, xmm3
|
||||||
|
|
||||||
|
pshufd xmm2, xmm1, 14 // upper 2 dwords
|
||||||
|
paddd xmm1, xmm2
|
||||||
|
pshufd xmm2, xmm1, 1 // dword 1 down to dword 0
|
||||||
|
paddd xmm1, xmm2 // dword 0 now holds the sum of all four lanes
|
||||||
|
paddd xmm0, xmm1 // only dword 0 is meaningful; it is the value returned
|
||||||
|
jg wloop
|
||||||
|
|
||||||
|
movd eax, xmm0 // return hash
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// This following works but is slower than movdqa version
|
||||||
|
// 66 0f 38 31 08 pmovzxbd xmm1, [eax]
|
||||||
|
// 66 0f 38 31 50 04 pmovzxbd xmm2, [eax + 4]
|
||||||
|
// 66 0f 38 31 58 08 pmovzxbd xmm3, [eax + 8]
|
||||||
|
// 66 0f 38 31 60 0c pmovzxbd xmm4, [eax + 12]
|
||||||
|
|
||||||
|
#define pmovzxbd0(rmem) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
|
||||||
|
_asm _emit 0x31 _asm _emit rmem
|
||||||
|
#define pmovzxbd(rmem0, rmem1) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \
|
||||||
|
_asm _emit 0x31 _asm _emit rmem0 _asm _emit rmem1
|
||||||
|
|
||||||
|
__declspec(naked) __declspec(align(16))
|
||||||
|
static uint32 HashDjb2_Unaligned_SSE41(const uint8* src, int count,
|
||||||
|
uint32 seed) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 4] // src
|
||||||
|
mov ecx, [esp + 8] // count
|
||||||
|
movd xmm0, [esp + 12] // seed
|
||||||
|
|
||||||
|
movdqa xmm5, kHash16x33
|
||||||
|
|
||||||
|
align 16
|
||||||
|
wloop:
|
||||||
|
pmovzxbd0(0x08) // src[0-3] pmovzxbd xmm1, [eax]
|
||||||
|
pmulld xmm1, kHashMul0
|
||||||
|
pmovzxbd(0x50, 0x04) // src[4-7] pmovzxbd xmm2, [eax + 4]
|
||||||
|
pmulld xmm2, kHashMul1
|
||||||
|
pmovzxbd(0x58, 0x08) // src[8-11] pmovzxbd xmm3, [eax + 8]
|
||||||
|
pmulld xmm3, kHashMul2
|
||||||
|
pmovzxbd(0x60, 0x0c) // src[12-15] pmovzxbd xmm4, [eax + 12]
|
||||||
|
pmulld xmm4, kHashMul3
|
||||||
|
lea eax, [eax + 16]
|
||||||
|
pmulld xmm0, xmm5 // hash *= 33 ^ 8
|
||||||
|
paddd xmm1, xmm2 // add 16 results
|
||||||
|
paddd xmm3, xmm4
|
||||||
|
sub ecx, 16
|
||||||
|
paddd xmm1, xmm3
|
||||||
|
pshufd xmm2, xmm1, 14 // upper 2 dwords
|
||||||
|
paddd xmm1, xmm2
|
||||||
|
pshufd xmm2, xmm1, 1
|
||||||
|
paddd xmm1, xmm2
|
||||||
|
paddd xmm0, xmm1
|
||||||
|
jg wloop
|
||||||
|
|
||||||
|
movd eax, xmm0 // return hash
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
// hash seed of 5381 recommended.
|
// hash seed of 5381 recommended.
|
||||||
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
|
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
|
||||||
|
uint32 (*Hash)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
|
||||||
|
#if defined(HAS_HASHDJB2_SSE41)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(count, 8)) {
|
||||||
|
Hash = HashDjb2_SSE41;
|
||||||
|
if (IS_ALIGNED(count, 16)) {
|
||||||
|
Hash = HashDjb2_Aligned_SSE41;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
const int kBlockSize = 1 << 15; // 32768;
|
const int kBlockSize = 1 << 15; // 32768;
|
||||||
while (count >= static_cast<uint64>(kBlockSize)) {
|
while (count >= static_cast<uint64>(kBlockSize)) {
|
||||||
seed = HashDjb2_C(src, kBlockSize, seed);
|
seed = Hash(src, kBlockSize, seed);
|
||||||
src += kBlockSize;
|
src += kBlockSize;
|
||||||
count -= kBlockSize;
|
count -= kBlockSize;
|
||||||
}
|
}
|
||||||
int remainder = static_cast<int>(count) & ~15;
|
int remainder = static_cast<int>(count) & ~15;
|
||||||
if (remainder) {
|
if (remainder) {
|
||||||
seed = HashDjb2_C(src, remainder, seed);
|
seed = Hash(src, remainder, seed);
|
||||||
src += remainder;
|
src += remainder;
|
||||||
count -= remainder;
|
count -= remainder;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -42,10 +42,16 @@ TEST_F(libyuvTest, TestDjb2) {
|
|||||||
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
|
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
|
||||||
EXPECT_EQ(h1, h2);
|
EXPECT_EQ(h1, h2);
|
||||||
}
|
}
|
||||||
|
int h = 1;
|
||||||
|
for (int i = 0; i <= 16 ; ++i) {
|
||||||
|
printf("%d ", h);
|
||||||
|
h *= 33;
|
||||||
|
}
|
||||||
|
|
||||||
free_aligned_buffer_16(src_a)
|
free_aligned_buffer_16(src_a)
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_F(libyuvTest, BenchmakDjb2) {
|
TEST_F(libyuvTest, BenchmakDjb2_C) {
|
||||||
const int kMaxTest = 1280 * 720;
|
const int kMaxTest = 1280 * 720;
|
||||||
|
|
||||||
align_buffer_16(src_a, kMaxTest)
|
align_buffer_16(src_a, kMaxTest)
|
||||||
@ -53,10 +59,29 @@ TEST_F(libyuvTest, BenchmakDjb2) {
|
|||||||
src_a[i] = i;
|
src_a[i] = i;
|
||||||
}
|
}
|
||||||
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
|
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
|
||||||
|
uint32 h1;
|
||||||
|
MaskCpuFlags(kCpuInitialized);
|
||||||
for (int i = 0; i < _benchmark_iterations; ++i) {
|
for (int i = 0; i < _benchmark_iterations; ++i) {
|
||||||
uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
|
h1 = HashDjb2(src_a, kMaxTest, 5381);
|
||||||
EXPECT_EQ(h1, h2);
|
|
||||||
}
|
}
|
||||||
|
MaskCpuFlags(-1);
|
||||||
|
EXPECT_EQ(h1, h2);
|
||||||
|
free_aligned_buffer_16(src_a)
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(libyuvTest, BenchmakDjb2_OPT) {
|
||||||
|
const int kMaxTest = 1280 * 720;
|
||||||
|
|
||||||
|
align_buffer_16(src_a, kMaxTest)
|
||||||
|
for (int i = 0; i < kMaxTest; ++i) {
|
||||||
|
src_a[i] = i;
|
||||||
|
}
|
||||||
|
uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
|
||||||
|
uint32 h1;
|
||||||
|
for (int i = 0; i < _benchmark_iterations; ++i) {
|
||||||
|
h1 = HashDjb2(src_a, kMaxTest, 5381);
|
||||||
|
}
|
||||||
|
EXPECT_EQ(h1, h2);
|
||||||
free_aligned_buffer_16(src_a)
|
free_aligned_buffer_16(src_a)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user