mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Sum of Square Error ported to AVX2
BUG=187 TEST=compare_unittest Review URL: https://webrtc-codereview.appspot.com/1099009 git-svn-id: http://libyuv.googlecode.com/svn/trunk@572 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
5f8858665b
commit
f3ad618d40
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 571
|
||||
Version: 572
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 571
|
||||
#define LIBYUV_VERSION 572
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -71,12 +71,19 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
|
||||
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
|
||||
#define HAS_SUMSQUAREERROR_NEON
|
||||
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
|
||||
#elif !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
|
||||
#endif
|
||||
#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
|
||||
defined(__x86_64__) || defined(__i386__))
|
||||
#define HAS_SUMSQUAREERROR_SSE2
|
||||
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
|
||||
#endif
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) && _MSC_VER >= 1700
|
||||
#define HAS_SUMSQUAREERROR_AVX2
|
||||
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
|
||||
#endif
|
||||
|
||||
// TODO(fbarchard): Refactor into row function.
|
||||
LIBYUV_API
|
||||
uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
|
||||
int count) {
|
||||
@ -86,16 +93,24 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SumSquareError = SumSquareError_NEON;
|
||||
}
|
||||
#elif defined(HAS_SUMSQUAREERROR_SSE2)
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) &&
|
||||
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
|
||||
// Note only used for multiples of 16 so count is not checked.
|
||||
SumSquareError = SumSquareError_SSE2;
|
||||
}
|
||||
#endif
|
||||
// 32K values will fit a 32bit int return value from SumSquareError.
|
||||
// After each block of 32K, accumulate into 64 bit int.
|
||||
const int kBlockSize = 1 << 15; // 32768;
|
||||
#if defined(HAS_SUMSQUAREERROR_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
// Note only used for multiples of 32 so count is not checked.
|
||||
SumSquareError = SumSquareError_AVX2;
|
||||
}
|
||||
#endif
|
||||
// SumSquareError returns values 0 to 65535 for each squared difference.
|
||||
// Up to 65536 of those can be summed and remain within a uint32.
|
||||
// After each block of 65536 pixels, accumulate into a uint64.
|
||||
const int kBlockSize = 65536;
|
||||
uint64 sse = 0;
|
||||
#ifdef _OPENMP
|
||||
#pragma omp parallel for reduction(+: sse)
|
||||
@ -105,13 +120,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
|
||||
}
|
||||
src_a += count & ~(kBlockSize - 1);
|
||||
src_b += count & ~(kBlockSize - 1);
|
||||
int remainder = count & (kBlockSize - 1) & ~15;
|
||||
int remainder = count & (kBlockSize - 1) & ~31;
|
||||
if (remainder) {
|
||||
sse += SumSquareError(src_a, src_b, remainder);
|
||||
src_a += remainder;
|
||||
src_b += remainder;
|
||||
}
|
||||
remainder = count & 15;
|
||||
remainder = count & 31;
|
||||
if (remainder) {
|
||||
sse += SumSquareError_C(src_a, src_b, remainder);
|
||||
}
|
||||
@ -122,20 +137,30 @@ LIBYUV_API
|
||||
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
|
||||
const uint8* src_b, int stride_b,
|
||||
int width, int height) {
|
||||
|
||||
if (stride_a == width && stride_b == width) {
|
||||
return ComputeSumSquareError(src_a, src_b, width * height);
|
||||
}
|
||||
|
||||
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
|
||||
SumSquareError_C;
|
||||
#if defined(HAS_SUMSQUAREERROR_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SumSquareError = SumSquareError_NEON;
|
||||
}
|
||||
#elif defined(HAS_SUMSQUAREERROR_SSE2)
|
||||
#endif
|
||||
#if defined(HAS_SUMSQUAREERROR_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
|
||||
IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
|
||||
IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
|
||||
SumSquareError = SumSquareError_SSE2;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_SUMSQUAREERROR_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
|
||||
SumSquareError = SumSquareError_AVX2;
|
||||
}
|
||||
#endif
|
||||
uint64 sse = 0;
|
||||
for (int h = 0; h < height; ++h) {
|
||||
sse += SumSquareError(src_a, src_b, width);
|
||||
|
||||
@ -56,6 +56,50 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
}
|
||||
}
|
||||
|
||||
// Visual C 2012 required for AVX2.
|
||||
#if _MSC_VER >= 1700
|
||||
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
|
||||
#pragma warning(disable: 4752)
|
||||
__declspec(naked) __declspec(align(16))
|
||||
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_a
|
||||
mov edx, [esp + 8] // src_b
|
||||
mov ecx, [esp + 12] // count
|
||||
vpxor ymm0, ymm0, ymm0 // sum
|
||||
vpxor ymm5, ymm5, ymm5 // for unpack.
|
||||
sub edx, eax
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
vmovdqu ymm1, [eax]
|
||||
vmovdqu ymm2, [eax + edx]
|
||||
lea eax, [eax + 32]
|
||||
sub ecx, 32
|
||||
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
||||
vpsubusb ymm2, ymm2, ymm1
|
||||
vpor ymm1, ymm2, ymm3
|
||||
vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
|
||||
vpunpckhbw ymm1, ymm1, ymm5
|
||||
vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
|
||||
vpmaddwd ymm1, ymm1, ymm1
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpaddd ymm0, ymm0, ymm2
|
||||
jg wloop
|
||||
|
||||
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpermq ymm1, ymm0, 0x02 // high + low lane.
|
||||
vpaddd ymm4, ymm0, ymm1
|
||||
vzeroupper // TODO(fbarchard): Remove.
|
||||
movd eax, xmm4
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // _MSC_VER >= 1700
|
||||
|
||||
#define HAS_HASHDJB2_SSE41
|
||||
static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
|
||||
static const uvec32 kHashMul0 = {
|
||||
@ -140,8 +184,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
#endif // _M_IX86
|
||||
#endif // !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user