From 7344440fb22b65ec4915b6084963816404edc088 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Thu, 9 Aug 2012 17:33:29 +0000
Subject: [PATCH] AffineRow for GCC. BUG=62 TEST=planar_unittest Review URL:
 https://webrtc-codereview.appspot.com/733004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@317 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 DEPS                              |  2 +-
 README.chromium                   |  2 +-
 include/libyuv/planar_functions.h |  8 +++-
 include/libyuv/version.h          |  2 +-
 source/row.h                      |  2 +-
 source/row_posix.cc               | 76 +++++++++++++++++++++++++++++++
 6 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/DEPS b/DEPS
index 4c2263c5d..c0cfc5dfc 100644
--- a/DEPS
+++ b/DEPS
@@ -27,7 +27,7 @@ deps = {
   # Dependencies used by libjpeg-turbo
   # Optional jpeg decoder
   "trunk/third_party/libjpeg_turbo/":
-    Var("chromium_trunk") + "/deps/third_party/libjpeg_turbo@119959",
+    Var("chromium_trunk") + "/deps/third_party/libjpeg_turbo@149334",
 
   # Yasm assember required for libjpeg_turbo
   "trunk/third_party/yasm/":
diff --git a/README.chromium b/README.chromium
index 6b77771ec..20f3afdcb 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 315
+Version: 316
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 2b2f9c307..d411966a5 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -228,11 +228,17 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     uint8* dst_argb, int dst_stride_argb,
                     int width, int height, int interpolation);
 
+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+    defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif  // YUV_DISABLE_ASM gates the SSE2 row declaration below.
 // Row functions for copying a pixels from a source with a slope to a row
 // of destination.  Useful for scaling, rotation, mirror, texture mapping.
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                      uint8* dst_argb, const float* uv_dudv, int width);
-#if defined(_MSC_VER)
+// The following are available on all x86 platforms:
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width);
 #define HAS_ARGBAFFINEROW_SSE2
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index ab79f711f..5220b1026 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 315
+#define LIBYUV_VERSION 316
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row.h b/source/row.h
index c09320751..db5e7b44b 100644
--- a/source/row.h
+++ b/source/row.h
@@ -79,6 +79,7 @@ extern "C" {
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGE_SSE2
 #define HAS_ARGBSHADE_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
 #endif
 
 // The following are Windows only:
@@ -87,7 +88,6 @@ extern "C" {
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
-#define HAS_ARGBAFFINEROW_SSE2
 #endif
 
 // The following are disabled when SSSE3 is available:
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 06aefb516..ee1dbc00a 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3219,6 +3219,82 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 }
 #endif  // HAS_ARGBSHADE_SSE2
 
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source along a slope: uv_dudv holds {u, v, du, dv}.
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  intptr_t src_argb_stride_temp = src_argb_stride;  // pointer-sized for %1
+  asm volatile (
+    "movq      (%3),%%xmm2                     \n"  // xmm2 = u, v
+    "movq      0x8(%3),%%xmm3                  \n"  // xmm3 = du, dv
+    "shl       $0x10,%1                        \n"  // stride to high word
+    "add       $0x4,%1                         \n"  // low word = 4 bytes/pixel
+    "movd      %1,%%xmm4                       \n"  // xmm4 = {4, stride} words
+    "xor       %1,%1                           \n"  // cleanse upper bits.
+    "sub       $0x2,%4                         \n"  // width -= 2
+    "jl        29f                             \n"  // fewer than 2 pixels
+    "movdqa    %%xmm2,%%xmm0                   \n"  // xmm0 = u, v
+    "addps     %%xmm3,%%xmm0                   \n"  // xmm0 = u+du, v+dv
+    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = u,v,u+du,v+dv
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // broadcast {4, stride}
+    "movlhps   %%xmm3,%%xmm3                   \n"  // xmm3 = du,dv,du,dv
+    "addps     %%xmm3,%%xmm3                   \n"  // step = 2*du, 2*dv
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // redundant repeat
+
+  // 2 pixel loop: xmm2 holds the source coordinates of two pixels.
+    ".p2align  2                               \n"
+  "20:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm1                   \n"  // truncate to int x,y
+    "packssdw  %%xmm1,%%xmm1                   \n"  // saturate to int16
+    "pmaddwd   %%xmm4,%%xmm1                   \n"  // x * 4 + y * stride
+    "addps     %%xmm3,%%xmm2                   \n"  // advance u,v
+    "movd      %%xmm1,%1                       \n"  // offset of pixel 0
+    "and       $0x0fffffff,%1                  \n"  // keep low 28 bits
+    "movdqa    %%xmm1,%%xmm5                   \n"
+    "pshufd    $0x55,%%xmm5,%%xmm5             \n"  // select pixel 1 offset
+    "movd      (%0,%1,1),%%xmm0                \n"  // load pixel 0
+    "movd      %%xmm5,%1                       \n"  // offset of pixel 1
+    "and       $0x0fffffff,%1                  \n"  // keep low 28 bits
+    "movd      (%0,%1,1),%%xmm5                \n"  // load pixel 1
+    "punpckldq %%xmm5,%%xmm0                   \n"  // xmm0 = pixel0, pixel1
+    "sub       $0x2,%4                         \n"  // width -= 2
+    "movq      %%xmm0,(%2)                     \n"  // store 2 pixels
+    "lea       0x8(%2),%2                      \n"  // dst += 8, keeps flags
+    "jge       20b                             \n"  // loop while 2+ remain
+
+  "29:                                         \n"
+    "add       $0x1,%4                         \n"  // width += 1
+    "jl        19f                             \n"  // no pixel left
+
+  // 1 pixel loop: handles the last pixel when one remains.
+    ".p2align  2                               \n"
+  "10:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm1                   \n"  // truncate to int x,y
+    "packssdw  %%xmm1,%%xmm1                   \n"  // saturate to int16
+    "pmaddwd   %%xmm4,%%xmm1                   \n"  // x * 4 + y * stride
+    "addps     %%xmm3,%%xmm2                   \n"  // advance u,v
+    "movd      %%xmm1,%1                       \n"  // offset of pixel
+    "and       $0x0fffffff,%1                  \n"  // keep low 28 bits
+    "movd      (%0,%1,1),%%xmm0                \n"  // load pixel
+    "sub       $0x1,%4                         \n"  // width -= 1
+    "movd      %%xmm0,(%2)                     \n"  // store 1 pixel
+    "lea       0x4(%2),%2                      \n"  // dst += 4, keeps flags
+    "jge       10b                             \n"  // loop if pixels remain
+  "19:                                         \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_stride_temp),  // %1
+    "+r"(dst_argb),  // %2
+    "+r"(uv_dudv),   // %3
+    "+rm"(width)     // %4
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus