From f3ad618d40900f93029933d940608e888c571b90 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Wed, 13 Feb 2013 18:38:03 +0000
Subject: [PATCH] Sum of Square Error ported to AVX2

BUG=187
TEST=compare_unittest
Review URL: https://webrtc-codereview.appspot.com/1099009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@572 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |  2 +-
 include/libyuv/version.h |  2 +-
 source/compare.cc        | 43 ++++++++++++++++++++++++++++--------
 source/compare_win.cc    | 47 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 81 insertions(+), 13 deletions(-)

diff --git a/README.chromium b/README.chromium
index d08c3f14a..2263bf9c1 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 571
+Version: 572
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 85d5c1667..cf4b72a5b 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 571
+#define LIBYUV_VERSION 572
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/compare.cc b/source/compare.cc
index 06d9dbe6b..2dbf311b5 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -71,12 +71,19 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
-#elif !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
+#endif
+#if !defined(YUV_DISABLE_ASM) && (defined(_M_IX86) || \
     defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
 #endif
+// Visual C 2012 required for AVX2.
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) && _MSC_VER >= 1700
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
 
+// TODO(fbarchard): Refactor into row function.
 LIBYUV_API
 uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
                              int count) {
@@ -86,16 +93,24 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
   }
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
     // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
-  // 32K values will fit a 32bit int return value from SumSquareError.
-  // After each block of 32K, accumulate into 64 bit int.
-  const int kBlockSize = 1 << 15;  // 32768;
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    // Note only used for multiples of 32 so count is not checked.
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
+  // SumSquareError returns values 0 to 65535 for each squared difference.
+  // Up to 65536 of those can be summed and remain within a uint32.
+  // After each block of 65536 pixels, accumulate into a uint64.
+  const int kBlockSize = 65536;
   uint64 sse = 0;
 #ifdef _OPENMP
 #pragma omp parallel for reduction(+: sse)
@@ -105,13 +120,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   }
   src_a += count & ~(kBlockSize - 1);
   src_b += count & ~(kBlockSize - 1);
-  int remainder = count & (kBlockSize - 1) & ~15;
+  int remainder = count & (kBlockSize - 1) & ~31;
   if (remainder) {
     sse += SumSquareError(src_a, src_b, remainder);
     src_a += remainder;
     src_b += remainder;
   }
-  remainder = count & 15;
+  remainder = count & 31;
   if (remainder) {
     sse += SumSquareError_C(src_a, src_b, remainder);
   }
@@ -122,20 +137,30 @@ LIBYUV_API
 uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
                                   const uint8* src_b, int stride_b,
                                   int width, int height) {
+
+  if (stride_a == width && stride_b == width) {
+    return ComputeSumSquareError(src_a, src_b, width * height);
+  }
+
   uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b,
                            int count) = SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
   }
-#elif defined(HAS_SUMSQUAREERROR_SSE2)
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
       IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
       IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
-
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    SumSquareError = SumSquareError_AVX2;
+  }
+#endif
   uint64 sse = 0;
   for (int h = 0; h < height; ++h) {
     sse += SumSquareError(src_a, src_b, width);
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 7fb61d7f0..1a4ad1985 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -56,6 +56,50 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   }
 }
 
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+  __asm {
+    mov        eax, [esp + 4]    // src_a
+    mov        edx, [esp + 8]    // src_b
+    mov        ecx, [esp + 12]   // count
+    vpxor      ymm0, ymm0, ymm0  // sum
+    vpxor      ymm5, ymm5, ymm5  // for unpack.
+    sub        edx, eax
+
+    align      16
+  wloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + edx]
+    lea        eax,  [eax + 32]
+    sub        ecx, 32
+    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
+    vpsubusb   ymm2, ymm2, ymm1
+    vpor       ymm1, ymm2, ymm3
+    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
+    vpunpckhbw ymm1, ymm1, ymm5
+    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
+    vpmaddwd   ymm1, ymm1, ymm1
+    vpaddd     ymm0, ymm0, ymm1
+    vpaddd     ymm0, ymm0, ymm2
+    jg         wloop
+
+    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
+    vpaddd     ymm0, ymm0, ymm1
+    vpermq     ymm1, ymm0, 0x02  // high + low lane.
+    vpaddd     ymm4, ymm0, ymm1
+    vzeroupper  // TODO(fbarchard): Remove.
+    movd       eax, xmm4
+    ret
+  }
+}
+#endif  // _MSC_VER >= 1700
+
 #define HAS_HASHDJB2_SSE41
 static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
 static const uvec32 kHashMul0 = {
@@ -140,8 +184,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     ret
   }
 }
-
-#endif  // _M_IX86
+#endif  // !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 
 #ifdef __cplusplus
 }  // extern "C"
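
Reviewer note, not part of the patch: the scalar sketch below shows what each
iteration of the AVX2 kernel computes. It assumes <stdint.h> types rather than
libyuv's uint8/uint32 typedefs, and the names SumSquareErrorScalar and
SaturatedSub are hypothetical. The point is the vpsubusb/vpor idiom: for
unsigned bytes, |a - b| equals saturating_sub(a, b) | saturating_sub(b, a),
because one of the two saturating subtractions is always zero; vpmaddwd then
squares the 16-bit differences and adds adjacent pairs into 32-bit lanes that
vpaddd accumulates.

  #include <stdint.h>

  // Unsigned saturating subtract, the scalar counterpart of vpsubusb.
  static uint8_t SaturatedSub(uint8_t a, uint8_t b) {
    return (a > b) ? (uint8_t)(a - b) : 0;
  }

  // Scalar sketch of the work the AVX2 loop does 32 bytes at a time.
  uint32_t SumSquareErrorScalar(const uint8_t* src_a, const uint8_t* src_b,
                                int count) {
    uint32_t sse = 0;
    for (int i = 0; i < count; ++i) {
      // |a - b| via two saturating subtractions OR'd together.
      uint8_t diff = SaturatedSub(src_a[i], src_b[i]) |
                     SaturatedSub(src_b[i], src_a[i]);
      sse += (uint32_t)diff * diff;  // square and accumulate.
    }
    return sse;
  }

The blocking in ComputeSumSquareError follows from the same arithmetic: each
squared byte difference is at most 255 * 255 = 65025, so a block of 65536
differences sums to at most 65025 * 65536, which still fits in a uint32 before
it is folded into the 64-bit total.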