From 67a0987dd9f4100d7dd2233f392f8b91c276380c Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Thu, 7 Nov 2013 20:35:17 +0000
Subject: [PATCH] Scale Up2 ported to NaCL. BUG=none TEST=none
 R=nfullagar@chromium.org, nfullagar@google.com

Review URL: https://webrtc-codereview.appspot.com/3589004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@846 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium                   |  2 +-
 include/libyuv/planar_functions.h |  1 +
 include/libyuv/version.h          |  2 +-
 source/scale_argb.cc              | 36 ++++++++++++++++++++++++++++---
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/README.chromium b/README.chromium
index 06c1c70d1..6a7da8db4 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 845
+Version: 846
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 6b0dae855..22650f2ad 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -355,6 +355,7 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
 // dst_cumsum table of width * height * 16 bytes aligned to 16 byte boundary.
 // dst_stride32_cumsum is number of ints in a row (width * 4).
 // radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
 LIBYUV_API
 int ARGBBlur(const uint8* src_argb, int src_stride_argb,
              uint8* dst_argb, int dst_stride_argb,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 436574664..ad6b5253d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 845
+#define LIBYUV_VERSION 846
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index f224761fb..21ed8bcb9 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -371,7 +371,6 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-#define HAS_SCALEARGBCOLSUP2_SSE2
 __declspec(naked) __declspec(align(16))
 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                            int dst_width, int /* x */, int /* dx */) {
@@ -675,6 +674,39 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
   );
 }
 
+// Reads 4 ARGB pixels, duplicates each one and writes 8 pixels per loop pass.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int /* x */, int /* dx */) {
+  asm volatile (
+    ".p2align  4                               \n"  // 16-byte align loop.
+    BUNDLEALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"  // xmm0 = 4 src pixels.
+    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb += 16 bytes.
+    "movdqa    %%xmm0,%%xmm1                   \n"  // xmm1 = copy of xmm0.
+    "punpckldq %%xmm0,%%xmm0                   \n"  // xmm0 = px 0,0,1,1.
+    "punpckhdq %%xmm1,%%xmm1                   \n"  // xmm1 = px 2,2,3,3.
+    "sub       $0x8,%2                         \n"  // dst_width -= 8.
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"  // Store first 4 px.
+    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"  // Store next 4 px.
+    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_argb += 32 bytes.
+    "jg        1b                              \n"  // While dst_width > 0.
+
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+r"(dst_width)    // %2
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
 static uvec8 kShuffleColARGB = {
   0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
@@ -1363,14 +1395,12 @@ static void ScaleARGBSimple(int src_width, int src_height,
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ScaleARGBCols = ScaleARGBCols_SSE2;
-#if defined(HAS_SCALEARGBCOLS_SSE2)
     if (src_width * 2 == dst_width && IS_ALIGNED(dst_width, 8) &&
         (x >> 16) == 0 &&
         IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
         IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
       ScaleARGBCols = ScaleARGBColsUp2_SSE2;
     }
-#endif
   }
 #endif