From e9a27403d4a681ff92860044927a159bc60a8976 Mon Sep 17 00:00:00 2001
From: "nfullagar@google.com"
 <nfullagar@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Fri, 9 Aug 2013 18:21:57 +0000
Subject: [PATCH] First go at porting blur component to NaCl BUG=253 TEST=ncvar
 R=fbarchard@google.com

Review URL: https://webrtc-codereview.appspot.com/1995004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@754 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |  2 +-
 include/libyuv/row.h     |  1 +
 include/libyuv/version.h |  2 +-
 source/row_posix.cc      | 76 +++++++++++++++++++++++++---------------
 4 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/README.chromium b/README.chromium
index e2efe093d..be320d70e 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 753
+Version: 754
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 6cb41741f..e0211992f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -51,6 +51,7 @@ extern "C" {
 #define HAS_ARGBSHADEROW_SSE2
 #define HAS_ARGBSUBTRACTROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 
 // Conversions:
 #define HAS_ARGBTOBAYERROW_SSSE3
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 780748088..de3df3fc4 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 753
+#define LIBYUV_VERSION 754
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_posix.cc b/source/row_posix.cc
index d03cf40fe..03c7b268c 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -20,6 +20,11 @@ extern "C" {
 // This module is for GCC x86 and x64
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
 
+// TODO(nfullagar): For Native Client: When new toolchain becomes available,
+// take advantage of bundle lock / unlock feature. This will reduce the amount
+// of manual bundle alignment done below, and bundle alignment could even be
+// moved into each macro that doesn't use %%nacl: such as MEMOPREG.
+
 #if defined(__native_client__) && defined(__x86_64__)
 #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
 #define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
@@ -28,6 +33,10 @@ extern "C" {
     #offset "(%q" #base ",%q" #index "," #scale ")"
 #define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
 #define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg " \n"
+#define BUNDLEALIGN ".p2align 5 \n"
 #else
 #define MEMACCESS(base) "(%" #base ")"
 #define MEMACCESS2(offset, base) #offset "(%" #base ")"
@@ -36,6 +45,9 @@ extern "C" {
     #offset "(%" #base ",%" #index "," #scale ")"
 #define MEMMOVESTRING(s, d)
 #define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg " \n"
+#define BUNDLEALIGN
 #endif
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
@@ -4635,25 +4647,28 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 
   // 4 pixel loop                              \n"
     ".p2align  2                               \n"
+    BUNDLEALIGN
   "40:                                         \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "movdqa    0x10(%0),%%xmm1                 \n"
-    "movdqa    0x20(%0),%%xmm2                 \n"
-    "movdqa    0x30(%0),%%xmm3                 \n"
-    "psubd     (%0,%4,4),%%xmm0                \n"
-    "psubd     0x10(%0,%4,4),%%xmm1            \n"
-    "psubd     0x20(%0,%4,4),%%xmm2            \n"
-    "psubd     0x30(%0,%4,4),%%xmm3            \n"
-    "lea       0x40(%0),%0                     \n"
-    "psubd     (%1),%%xmm0                     \n"
-    "psubd     0x10(%1),%%xmm1                 \n"
-    "psubd     0x20(%1),%%xmm2                 \n"
-    "psubd     0x30(%1),%%xmm3                 \n"
-    "paddd     (%1,%4,4),%%xmm0                \n"
-    "paddd     0x10(%1,%4,4),%%xmm1            \n"
-    "paddd     0x20(%1,%4,4),%%xmm2            \n"
-    "paddd     0x30(%1,%4,4),%%xmm3            \n"
-    "lea       0x40(%1),%1                     \n"
+    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "movdqa    "MEMACCESS2(0x10,0)",%%xmm1     \n"
+    "movdqa    "MEMACCESS2(0x20,0)",%%xmm2     \n"
+    "movdqa    "MEMACCESS2(0x30,0)",%%xmm3     \n"
+    BUNDLEALIGN
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       "MEMLEA(0x40,0)",%0             \n"
+    "psubd     "MEMACCESS(1)",%%xmm0           \n"
+    "psubd     "MEMACCESS2(0x10,1)",%%xmm1     \n"
+    "psubd     "MEMACCESS2(0x20,1)",%%xmm2     \n"
+    "psubd     "MEMACCESS2(0x30,1)",%%xmm3     \n"
+    BUNDLEALIGN
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       "MEMLEA(0x40,0)",%0             \n"
     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
     "cvtdq2ps  %%xmm1,%%xmm1                   \n"
     "mulps     %%xmm4,%%xmm0                   \n"
@@ -4669,8 +4684,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
     "packssdw  %%xmm1,%%xmm0                   \n"
     "packssdw  %%xmm3,%%xmm2                   \n"
     "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0,(%2)                     \n"
-    "lea       0x10(%2),%2                     \n"
+    "movdqu    %%xmm0,"MEMACCESS(2)"           \n"
+    "lea       "MEMLEA(0x10,2)",%2             \n"
     "sub       $0x4,%3                         \n"
     "jge       40b                             \n"
 
@@ -4680,20 +4695,22 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
 
   // 1 pixel loop                              \n"
     ".p2align  2                               \n"
+    BUNDLEALIGN
   "10:                                         \n"
-    "movdqa    (%0),%%xmm0                     \n"
-    "psubd     (%0,%4,4),%%xmm0                \n"
-    "lea       0x10(%0),%0                     \n"
-    "psubd     (%1),%%xmm0                     \n"
-    "paddd     (%1,%4,4),%%xmm0                \n"
-    "lea       0x10(%1),%1                     \n"
+    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    "lea       "MEMLEA(0x10,0)",%0             \n"
+    "psubd     "MEMACCESS(1)",%%xmm0           \n"
+    BUNDLEALIGN
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    "lea       "MEMLEA(0x10,1)",%1             \n"
     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
     "mulps     %%xmm4,%%xmm0                   \n"
     "cvtps2dq  %%xmm0,%%xmm0                   \n"
     "packssdw  %%xmm0,%%xmm0                   \n"
     "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0,(%2)                     \n"
-    "lea       0x4(%2),%2                      \n"
+    "movd      %%xmm0,"MEMACCESS(2)"           \n"
+    "lea       "MEMLEA(0x4,2)",%2              \n"
     "sub       $0x1,%3                         \n"
     "jge       10b                             \n"
   "19:                                         \n"
@@ -4704,6 +4721,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
   : "r"(static_cast<intptr_t>(width)),  // %4
     "rm"(area)     // %5
   : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
 #endif