AffineRow for GCC.

BUG=62 TEST=planar_unittest Review URL: https://webrtc-codereview.appspot.com/733004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@317 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-01-01 03:12:16 +08:00 · 2012-08-09 17:33:29 +00:00 · 2012-08-09 17:33:29 +00:00 · 7344440fb2
commit 7344440fb2
parent 2a95465795
6 changed files with 87 additions and 5 deletions
--- a/2
+++ b/2
@ -27,7 +27,7 @@ deps = {
  # Dependencies used by libjpeg-turbo
  # Optional jpeg decoder
  "trunk/third_party/libjpeg_turbo/":
-    Var("chromium_trunk") + "/deps/third_party/libjpeg_turbo@119959",
+    Var("chromium_trunk") + "/deps/third_party/libjpeg_turbo@149334",

  # Yasm assembler required for libjpeg_turbo
  "trunk/third_party/yasm/":
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 315
+Version: 316
 License: BSD
 License File: LICENSE

--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@ -228,11 +228,17 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height, int interpolation);

+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+    defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif
 // Row functions for copying pixels from a source with a slope to a row
 // of destination.  Useful for scaling, rotation, mirror, texture mapping.
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
                     uint8* dst_argb, const float* uv_dudv, int width);
-#if defined(_MSC_VER)
+// The following are available on all x86 platforms:
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width);
 #define HAS_ARGBAFFINEROW_SSE2
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 315
+#define LIBYUV_VERSION 316

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row.h
+++ b/source/row.h
@ -79,6 +79,7 @@ extern "C" {
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGE_SSE2
 #define HAS_ARGBSHADE_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
 #endif

 // The following are Windows only:
@ -87,7 +88,6 @@ extern "C" {
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
-#define HAS_ARGBAFFINEROW_SSE2
 #endif

 // The following are disabled when SSSE3 is available:
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -3219,6 +3219,82 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 }
 #endif  // HAS_ARGBSHADE_SSE2

+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  intptr_t src_argb_stride_temp = src_argb_stride;  // pointer-sized: %1 is later the index in (%0,%1,1).
+  asm volatile (
+    "movq      (%3),%%xmm2                     \n"  // xmm2 low half = uv_dudv[0..1] (position).
+    "movq      0x8(%3),%%xmm3                  \n"  // xmm3 low half = uv_dudv[2..3] (per pixel step).
+    "shl       $0x10,%1                        \n"  // stride moves to the high 16 bits.
+    "add       $0x4,%1                         \n"  // low 16 bits = 4 (bytes per pixel).
+    "movd      %1,%%xmm4                       \n"  // xmm4 = word pair (4, stride) for pmaddwd.
+    "xor       %1,%1                           \n"  // cleanse upper bits.
+    "sub       $0x2,%4                         \n"  // bias count by 2 for the 2 pixel loop.
+    "jl        29f                             \n"  // fewer than 2 pixels: skip to the tail.
+    "movdqa    %%xmm2,%%xmm0                   \n"  // copy of the first position.
+    "addps     %%xmm3,%%xmm0                   \n"  // xmm0 = position of the second pixel.
+    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = first position, second position.
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // broadcast (4, stride) to all lanes.
+    "movlhps   %%xmm3,%%xmm3                   \n"  // step duplicated into the high half.
+    "addps     %%xmm3,%%xmm3                   \n"  // doubled: step for 2 pixels at a time.
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // NOTE(review): repeats the broadcast above; xmm4 is unchanged in between.

  // 2 pixel loop: xmm2 holds two positions, xmm3 the 2 pixel step.  \n"
+    ".p2align  2                               \n"
  "20:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm1                   \n"  // truncate positions to int32.
+    "packssdw  %%xmm1,%%xmm1                   \n"  // narrow to int16 with signed saturation.
+    "pmaddwd   %%xmm4,%%xmm1                   \n"  // byte offset = first * 4 + second * stride.
+    "addps     %%xmm3,%%xmm2                   \n"  // advance both positions.
+    "movd      %%xmm1,%1                       \n"  // offset of the first pixel.
+    "and       $0x0fffffff,%1                  \n"  // keep the low 28 bits of the offset.
+    "movdqa    %%xmm1,%%xmm5                   \n"
+    "pshufd    $0x55,%%xmm5,%%xmm5             \n"  // dword 1 = offset of the second pixel.
+    "movd      (%0,%1,1),%%xmm0                \n"  // load first pixel (4 bytes).
+    "movd      %%xmm5,%1                       \n"
+    "and       $0x0fffffff,%1                  \n"  // keep the low 28 bits of the offset.
+    "movd      (%0,%1,1),%%xmm5                \n"  // load second pixel (4 bytes).
+    "punpckldq %%xmm5,%%xmm0                   \n"  // both pixels in the low 8 bytes.
+    "sub       $0x2,%4                         \n"  // 2 pixels consumed; sets flags for jge.
+    "movq      %%xmm0,(%2)                     \n"  // store 2 pixels.
+    "lea       0x8(%2),%2                      \n"  // advance dst; lea leaves the flags alone.
+    "jge       20b                             \n"

  "29:                                         \n"
+    "add       $0x1,%4                         \n"  // rebias count from 2 to 1.
+    "jl        19f                             \n"  // nothing left.

  // 1 pixel loop: only the low position in xmm2 is used.  \n"
+    ".p2align  2                               \n"
  "10:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm1                   \n"  // truncate position to int32.
+    "packssdw  %%xmm1,%%xmm1                   \n"  // narrow to int16 with signed saturation.
+    "pmaddwd   %%xmm4,%%xmm1                   \n"  // byte offset = first * 4 + second * stride.
+    "addps     %%xmm3,%%xmm2                   \n"  // advance position.
+    "movd      %%xmm1,%1                       \n"
+    "and       $0x0fffffff,%1                  \n"  // keep the low 28 bits of the offset.
+    "movd      (%0,%1,1),%%xmm0                \n"  // load pixel (4 bytes).
+    "sub       $0x1,%4                         \n"  // 1 pixel consumed; sets flags for jge.
+    "movd      %%xmm0,(%2)                     \n"  // store 1 pixel.
+    "lea       0x4(%2),%2                      \n"  // advance dst; lea leaves the flags alone.
+    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0: source base pointer.
+    "+r"(src_argb_stride_temp),  // %1: stride on entry, then scratch offset.
+    "+r"(dst_argb),  // %2: destination pointer, advanced per store.
+    "+r"(uv_dudv),   // %3: pointer to the 4 input floats.
+    "+rm"(width)     // %4: remaining pixel count (biased).
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus