diff --git a/README.chromium b/README.chromium index ff9953eba..8daeba949 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 237 +Version: 238 License: BSD License File: LICENSE diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index d3adf51ad..81bc84414 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -23,10 +23,11 @@ static const int kCpuHasSSSE3 = 4; static const int kCpuHasSSE41 = 8; // These flags are only valid on ARM processors -static const int kCpuHasNEON = 16; +static const int kCpuHasARM = 16; +static const int kCpuHasNEON = 32; // Internal flag to indicate cpuid is initialized. -static const int kCpuInitialized = 32; +static const int kCpuInitialized = 64; // Detect CPU has SSE2 etc. // test_flag parameter should be one of kCpuHas constants above diff --git a/include/libyuv/version.h b/include/libyuv/version.h index cee7336af..fb97ce6da 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 237 +#define LIBYUV_VERSION 238 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare.cc b/source/compare.cc index 4612a29fb..05b0616a4 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -37,117 +37,33 @@ static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { // This module is for Visual C x86 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) - #define HAS_HASHDJB2_SSE41 -static const vec32 kMulL33 = { - 0xEC41D4E1, // 33 * 33 * 33 * 33 * 33 * 33 * 33 - 33 * 33 * 33 * 33 * 33 * 33, - 33 * 33 * 33 * 33 * 33, - 33 * 33 * 33 * 33 * 1 }; -static const vec32 kMulH33 = { - 33 * 33 * 33, - 33 * 33, - 33, - 1 }; -static const vec32 kHash4x33 = { 33 * 33 * 33 * 33, 0, 0, 0 }; -static const vec32 kHash8x33 = { - 0x747C7101, // 33 * 33 * 33 * 33 * 33 * 33 * 33 * 33, - 0, 0, 0 }; - - -// hash0 = initial state -// 
hash1 = hash0 * 33 + src[0] -// hash2 = hash1 * 33 + src[1] = (hash0 * 33 + src[0]) * 33 + src[1] -// hash3 = hash2 * 33 + src[2] = (hash1 * 33 + src[1]) * 33 + src[2] = -// ((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2] -// hash4 = hash3 * 33 + src[3] = (hash2 * 33 + src[2]) * 33 + src[3] = -// ((hash1 * 33 + src[1]) * 33 + src[2]) * 33 + src[3] = -// (((hash0 * 33 + src[0]) * 33 + src[1]) * 33 + src[2]) * 33 + src[3] - -// movzxbd xmm1, [eax] // SSE4.1 requires VS2010 -// pmulld requires Studio2008 -// does 8 at a time, unaligned -__declspec(naked) __declspec(align(16)) -static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm4, kHash8x33 - movdqa xmm5, kMulL33 - movdqa xmm6, kMulH33 - - align 16 - wloop: - movq xmm1, qword ptr [eax] // src[0-7] - lea eax, [eax + 8] - punpcklbw xmm1, xmm7 - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm7 - // pmulld xmm1, xmm5 - _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xCD - punpckhwd xmm3, xmm7 - // pmulld xmm3, xmm6 - _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xDE - sub ecx, 8 - // pmulld xmm0, xmm4 // hash *= 33 ^ 8 - _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 _asm _emit 0x40 _asm _emit 0xC4 - paddd xmm1, xmm3 // add 2nd 4 to first 4 - pshufd xmm2, xmm1, 14 // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 1 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} - -#define HAS_HASHDJB2_ALIGNED_SSE41 -static const vec32 kHash16x33 = { -1831214591, 0, 0, 0 }; // 33 ^ 16 -static const vec32 kHashMul0 = { - 204809697, // 33 ^ 15 - -1555599935, // 33 ^ 14 - 994064801, // 33 ^ 13 - 1331628417, // 33 ^ 12 +static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, 
// 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; -static const vec32 kHashMul1 = { - 821255521, // 33 ^ 11 - -2057521855, // 33 ^ 10 - 67801377, // 33 ^ 9 - 1954312449, // 33 ^ 8 +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; -static const vec32 kHashMul2 = { - -331229983, // 33 ^ 7 - 1291467969, // 33 ^ 6 - 39135393, // 33 ^ 5 - 1185921, // 33 ^ 4 +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; -static const vec32 kHashMul3 = { - 35937, // 33 ^ 3 - 1089, // 33 ^ 2 - 33, // 33 ^ 1 - 1, // 33 ^ 0 +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -// movzxbd xmm1, [eax] // SSE4.1 requires VS2010 -// pmulld requires Studio2008 -// does 16 at a time, aligned -// TODO(fbarchard): For SSE2 version use pmuludq -// pmulld xmm1, xmm5 -// becomes -// movdqa xmm2, xmm1 -// pmuludq xmm1, [33*33*33, 0, 33, 0] -// psrldq xmm2, 8 -// pmuludq xmm2, [33*33, 0, 1, 0] -// paddd xmm1, xmm2 -// pshufd xmm2, xmm1, 2 -// paddd xmm1, xmm2 - - //27: 66 0F 38 40 C6 pmulld xmm0,xmm6 //44: 66 0F 38 40 DD pmulld xmm3,xmm5 //59: 66 0F 38 40 E5 pmulld xmm4,xmm5 @@ -157,7 +73,7 @@ static const vec32 kHashMul3 = { _asm _emit 0x40 _asm _emit reg __declspec(naked) __declspec(align(16)) -static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) { +static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count @@ -168,7 +84,7 @@ static uint32 HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) { align 16 wloop: - movdqa xmm1, [eax] // src[0-15] + movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 8 movdqa xmm5, kHashMul0 @@ -205,80 +121,26 @@ static uint32 
HashDjb2_Aligned_SSE41(const uint8* src, int count, uint32 seed) { ret } } - -#if 0 -// This following works but is slower than movdqa version -// 66 0f 38 31 08 pmovzxbd xmm1, [eax] -// 66 0f 38 31 50 04 pmovzxbd xmm2, [eax + 4] -// 66 0f 38 31 58 08 pmovzxbd xmm3, [eax + 8] -// 66 0f 38 31 60 0c pmovzxbd xmm4, [eax + 12] - -#define pmovzxbd0(rmem) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \ - _asm _emit 0x31 _asm _emit rmem -#define pmovzxbd(rmem0, rmem1) _asm _emit 0x66 _asm _emit 0x0f _asm _emit 0x38 \ - _asm _emit 0x31 _asm _emit rmem0 _asm _emit rmem1 - -__declspec(naked) __declspec(align(16)) -static uint32 HashDjb2_Unaligned_SSE41(const uint8* src, int count, - uint32 seed) { - __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - - movdqa xmm5, kHash16x33 - - align 16 - wloop: - pmovzxbd0(0x08) // src[0-3] pmovzxbd xmm1, [eax] - pmulld xmm1, kHashMul0 - pmovzxbd(0x50, 0x04) // src[4-7] pmovzxbd xmm2, [eax + 4] - pmulld xmm2, kHashMul1 - pmovzxbd(0x58, 0x08) // src[8-11] pmovzxbd xmm3, [eax + 8] - pmulld xmm3, kHashMul2 - pmovzxbd(0x60, 0x0c) // src[12-15] pmovzxbd xmm4, [eax + 12] - pmulld xmm4, kHashMul3 - lea eax, [eax + 16] - pmulld xmm0, xmm5 // hash *= 33 ^ 8 - paddd xmm1, xmm2 // add 16 results - paddd xmm3, xmm4 - sub ecx, 16 - paddd xmm1, xmm3 - pshufd xmm2, xmm1, 14 // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 1 - paddd xmm1, xmm2 - paddd xmm0, xmm1 - jg wloop - - movd eax, xmm0 // return hash - ret - } -} -#endif - #endif // hash seed of 5381 recommended. 
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { - uint32 (*Hash)(const uint8* src, int count, uint32 seed) = HashDjb2_C; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) - if (TestCpuFlag(kCpuHasSSE41) && IS_ALIGNED(count, 8)) { - Hash = HashDjb2_SSE41; - if (IS_ALIGNED(count, 16)) { - Hash = HashDjb2_Aligned_SSE41; - } + if (TestCpuFlag(kCpuHasSSE41)) { + HashDjb2_SSE = HashDjb2_SSE41; } #endif + const int kBlockSize = 1 << 15; // 32768; while (count >= static_cast<uint64>(kBlockSize)) { - seed = Hash(src, kBlockSize, seed); + seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; } int remainder = static_cast<int>(count) & ~15; if (remainder) { - seed = Hash(src, remainder, seed); + seed = HashDjb2_SSE(src, remainder, seed); src += remainder; count -= remainder; } diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 63fedcb80..d83a88a55 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -91,15 +91,18 @@ int InitCpuFlags() { if (getenv("LIBYUV_DISABLE_SSE41")) { cpu_info_ &= ~kCpuHasSSE41; } + if (getenv("LIBYUV_DISABLE_ASM")) { + cpu_info_ = kCpuInitialized; + } #elif defined(__linux__) && defined(__ARM_NEON__) cpu_info_ = ArmCpuCaps("/proc/cpuinfo") | kCpuInitialized; #elif defined(__ARM_NEON__) // gcc -mfpu=neon defines __ARM_NEON__ // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags // to disable Neon on devices that do not have it. 
- cpu_info_ = kCpuHasNEON | kCpuInitialized; + cpu_info_ = kCpuHasNEON | kCpuInitialized | kCpuHasARM; #else - cpu_info_ = kCpuInitialized; + cpu_info_ = kCpuInitialized | kCpuHasARM; #endif return cpu_info_; } diff --git a/source/row.h b/source/row.h index 46fda7acb..2b68ae016 100644 --- a/source/row.h +++ b/source/row.h @@ -89,13 +89,13 @@ extern "C" { typedef __declspec(align(16)) int8 vec8[16]; typedef __declspec(align(16)) uint8 uvec8[16]; typedef __declspec(align(16)) int16 vec16[8]; -typedef __declspec(align(16)) int32 vec32[4]; +typedef __declspec(align(16)) uint32 uvec32[4]; #else // __GNUC__ #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) typedef int8 __attribute__((vector_size(16))) vec8; typedef uint8 __attribute__((vector_size(16))) uvec8; typedef int16 __attribute__((vector_size(16))) vec16; -typedef int32 __attribute__((vector_size(16))) vec32; +typedef uint32 __attribute__((vector_size(16))) uvec32; #endif void I420ToARGBRow_NEON(const uint8* y_buf, diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index 201b321b6..d5400b07e 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -42,11 +42,13 @@ TEST_F(libyuvTest, TestDjb2) { uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); EXPECT_EQ(h1, h2); } + // Hash constant generator used for tables in compare int h = 1; for (int i = 0; i <= 16 ; ++i) { - printf("%d ", h); + printf("%08x ", h); h *= 33; } + printf("\n"); free_aligned_buffer_16(src_a) } @@ -85,6 +87,22 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) { free_aligned_buffer_16(src_a) } +TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) { + const int kMaxTest = 1280 * 720; + + align_buffer_16(src_a, kMaxTest + 1) + for (int i = 0; i < kMaxTest; ++i) { + src_a[i + 1] = i; + } + uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381); + uint32 h1; + for (int i = 0; i < _benchmark_iterations; ++i) { + h1 = HashDjb2(src_a + 1, kMaxTest, 5381); + } + EXPECT_EQ(h1, h2); + free_aligned_buffer_16(src_a) +} + 
TEST_F(libyuvTest, BenchmarkSumSquareError_C) { const int max_width = 4096*3; diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index e4b62fb5b..5c0f83be1 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -35,6 +35,10 @@ TEST_F(libyuvTest, TestCpuHas) { #if LIBYUV_VERSION >= 236 int has_sse41 = TestCpuFlag(kCpuHasSSE41); printf("Has SSE4.1 %d\n", has_sse41); +#endif +#if LIBYUV_VERSION >= 238 + int has_arm = TestCpuFlag(kCpuHasARM); + printf("Has ARM %d\n", has_arm); #endif int has_neon = TestCpuFlag(kCpuHasNEON); printf("Has NEON %d\n", has_neon);