From f3ad618d40900f93029933d940608e888c571b90 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Wed, 13 Feb 2013 18:38:03 +0000
Subject: [PATCH] Sum of Square Error ported to AVX2

BUG=187
TEST=compare_unittest
Review URL: https://webrtc-codereview.appspot.com/1099009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@572 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |  2 +-
 include/libyuv/version.h |  2 +-
 source/compare.cc        | 43 ++++++++++++++++++++++++++++--------
 source/compare_win.cc    | 47 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 81 insertions(+), 13 deletions(-)

diff --git a/README.chromium b/README.chromium
index d08c3f14a..2263bf9c1 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 571
+Version: 572
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 85d5c1667..cf4b72a5b 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 571
+#define LIBYUV_VERSION 572
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/compare.cc b/source/compare.cc
index 06d9dbe6b..2dbf311b5 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -71,12 +71,19 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
-#elif !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
+#endif
+#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
     defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
 #endif
+// Visual C 2012 required for AVX2.
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) && _MSC_VER >= 1700
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
 
+// TODO(fbarchard): Refactor into row function.
 LIBYUV_API
 uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
                              int count) {
@@ -86,16 +93,24 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
   }
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
     // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
-  // 32K values will fit a 32bit int return value from SumSquareError.
-  // After each block of 32K, accumulate into 64 bit int.
-  const int kBlockSize = 1 << 15;  // 32768;
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note only used for multiples of 32 so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
+  // SumSquareError returns values 0 to 65535 for each squared difference.
+  // Up to 65536 of those can be summed and remain within a uint32.
+  // After each block of 65536 pixels, accumulate into a uint64.
+  const int kBlockSize = 65536;
   uint64 sse = 0;
 #ifdef _OPENMP
 #pragma omp parallel for reduction(+: sse)
@@ -105,13 +120,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   }
   src_a += count & ~(kBlockSize - 1);
   src_b += count & ~(kBlockSize - 1);
-  int remainder = count & (kBlockSize - 1) & ~15;
+  int remainder = count & (kBlockSize - 1) & ~31;
   if (remainder) {
     sse += SumSquareError(src_a, src_b, remainder);
     src_a += remainder;
     src_b += remainder;
   }
-  remainder = count & 15;
+  remainder = count & 31;
   if (remainder) {
     sse += SumSquareError_C(src_a, src_b, remainder);
   }
@@ -122,20 +137,30 @@ LIBYUV_API
 uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
                                   const uint8* src_b, int stride_b,
                                   int width, int height) {
+
+  if (stride_a == width && stride_b == width) {
+    return ComputeSumSquareError(src_a, src_b, width * height);
+  }
+
   uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b,
                            int count) = SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
   }
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
       IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
       IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
-
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
   uint64 sse = 0;
   for (int h = 0; h < height; ++h) {
     sse += SumSquareError(src_a, src_b, width);
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 7fb61d7f0..1a4ad1985 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -56,6 +56,50 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   }
 }
 
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    vpxor      ymm0, ymm0, ymm0  // sum
+    vpxor      ymm5, ymm5, ymm5  // for unpack.
+    sub        edx, eax
+
+    align      16
+  wloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + edx]
+    lea        eax,  [eax + 32]
+    sub        ecx, 32
+    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
+    vpsubusb   ymm2, ymm2, ymm1
+    vpor       ymm1, ymm2, ymm3
+    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
+    vpunpckhbw ymm1, ymm1, ymm5
+    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
+    vpmaddwd   ymm1, ymm1, ymm1
+    vpaddd     ymm0, ymm0, ymm1
+    vpaddd     ymm0, ymm0, ymm2
+    jg         wloop
+
+    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpermq     ymm1, ymm0, 0x02  // high + low lane.
+    vpaddd     ymm4, ymm0, ymm1
+    vzeroupper  // TODO(fbarchard): Remove.
+    movd       eax, xmm4
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
 #define HAS_HASHDJB2_SSE41
 static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
 static const uvec32 kHashMul0 = {
@@ -140,8 +184,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     ret
   }
 }
-
-#endif  // _M_IX86
+#endif  // !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 
 #ifdef __cplusplus
 }  // extern "C"
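
Reviewer note, not part of the patch: the scalar sketch below shows what each
iteration of the AVX2 kernel computes. It assumes <stdint.h> types rather than
libyuv's uint8/uint32 typedefs, and the names SumSquareErrorScalar and
SaturatedSub are hypothetical. The point is the vpsubusb/vpor idiom: for
unsigned bytes, |a - b| equals saturating_sub(a, b) | saturating_sub(b, a),
because one of the two saturating subtractions is always zero; vpmaddwd then
squares the 16-bit differences and adds adjacent pairs into 32-bit lanes that
vpaddd accumulates.

  #include <stdint.h>

  // Unsigned saturating subtract, the scalar counterpart of vpsubusb.
  static uint8_t SaturatedSub(uint8_t a, uint8_t b) {
    return (a > b) ? (uint8_t)(a - b) : 0;
  }

  // Scalar sketch of the work the AVX2 loop does 32 bytes at a time.
  uint32_t SumSquareErrorScalar(const uint8_t* src_a, const uint8_t* src_b,
                                int count) {
    uint32_t sse = 0;
    for (int i = 0; i < count; ++i) {
      // |a - b| via two saturating subtractions OR'd together.
      uint8_t diff = SaturatedSub(src_a[i], src_b[i]) |
                     SaturatedSub(src_b[i], src_a[i]);
      sse += (uint32_t)diff * diff;  // square and accumulate.
    }
    return sse;
  }

The blocking in ComputeSumSquareError follows from the same arithmetic: each
squared byte difference is at most 255 * 255 = 65025, so a block of 65536
differences sums to at most 65025 * 65536, which still fits in a uint32 before
it is folded into the 64-bit total.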