Add ARGBAffineRow_SSE2, a function that copies pixels sampled along a sloped line in the source image to a row of the destination.

BUG=60 TEST=none Review URL: https://webrtc-codereview.appspot.com/727004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@313 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-07-30 16:26:19 +08:00 · 2012-08-08 18:10:15 +00:00 · 2012-08-08 18:10:15 +00:00 · 864f828a01
commit 864f828a01
parent 4f10e97ff2
4 changed files with 76 additions and 3 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 312
+Version: 313
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 312
+#define LIBYUV_VERSION 313

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row.h
+++ b/source/row.h
@ -87,6 +87,7 @@ extern "C" {
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
+#define HAS_ARGBAFFINEROW_SSE2
 #endif

 // The following are disabled when SSSE3 is available:
@ -522,6 +523,8 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                    uint32 value);
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value);
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);

 #ifdef __cplusplus
 }  // extern "C"
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -3347,8 +3347,78 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 }
 #endif  // HAS_ARGBSHADE_SSE2

-#endif  // _M_IX86
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy width ARGB pixels to dst_argb, sampling src_argb along a sloped line.
+// uv_dudv points to 4 floats: start x, y followed by the per-pixel step dx, dy.
+// Coordinates are truncated to int, then x, y and stride pass through signed
+// 16-bit lanes (packssdw / pmaddwd), so each of them has to fit in a short.
+__declspec(naked) __declspec(align(16))
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  __asm {
+    push       esi              // scratch below; arg offsets include this push
+    mov        eax, [esp + 8]   // src_argb
+    mov        esi, [esp + 12]  // stride
+    mov        edx, [esp + 16]  // dst_argb
+    mov        ecx, [esp + 20]  // pointer to uv_dudv
+    movq       xmm2, qword ptr [ecx]  // uv: x, y in low 2 floats, rest zero
+    movq       xmm3, qword ptr [ecx + 8]  // dudv: dx, dy in low 2 floats
+    mov        ecx, [esp + 24]  // width
+    shl        esi, 16          // high word = stride (upper 16 bits dropped)
+    add        esi, 4           // low word = 4 bytes per ARGB pixel
+    movd       xmm4, esi        // multiplier pair (4, stride) for pmaddwd
+    sub        ecx, 2           // ecx = width - 2; negative if no pixel pair
+    jl         l2b              // 1 pixel path only reads low dword of xmm4

+    movdqa     xmm0, xmm2    // build x0, y0, x1, y1 in xmm2
+    addps      xmm0, xmm3    // x1, y1 = x0 + dx, y0 + dy
+    movlhps    xmm2, xmm0    // xmm2 = x0, y0, x1, y1
+    pshufd     xmm4, xmm4, 0  // dup 4, stride into every dword
+    movlhps    xmm3, xmm3    // dudv for both pixels
+    addps      xmm3, xmm3    // dudv *= 2, since each pass emits 2 pixels
+     // 2 pixel loop
+    align      4
+  l2:
+    cvttps2dq  xmm1, xmm2    // x, y float to int (truncating)
+    packssdw   xmm1, xmm1    // x, y as shorts (signed saturation)
+    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride, one dword per pixel
+    addps      xmm2, xmm3    // x, y += dx, dy (2 steps for both pixels)
+    movd       esi, xmm1     // byte offset of pixel 0
+    pshufd     xmm5, xmm1, 0x55  // dword 1 = byte offset of pixel 1
+    movd       xmm0, [eax + esi]  // read pixel 0
+    movd       esi, xmm5
+    movd       xmm5, [eax + esi]  // read pixel 1
+    punpckldq  xmm0, xmm5    // low qword = pixel 0, pixel 1
+    sub        ecx, 2        // sets the flags tested by jge below
+    movq       qword ptr [edx], xmm0  // write 2 pixels
+    lea        edx, [edx + 8]  // advance dst; lea keeps the flags from sub
+    jge        l2
+
+  l2b:
+    add        ecx, 2 - 1    // ecx = pixels left - 1: 0 if one left, else < 0
+    jl         l1b
+
+    // 1 pixel loop
+    align      4
+  l1:
+    cvttps2dq  xmm1, xmm2    // x, y float to int (truncating)
+    packssdw   xmm1, xmm1    // x, y as shorts (signed saturation)
+    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride in low dword
+    addps      xmm2, xmm3    // x, y += dx, dy (at most 1 pixel is left here)
+    movd       esi, xmm1     // byte offset of the pixel
+    movd       xmm0, [eax + esi]  // copy a pixel
+    sub        ecx, 1        // sets the flags tested by jge below
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]  // advance dst; lea keeps the flags from sub
+    jge        l1
+  l1b:
+    pop        esi           // restore caller's esi
+    ret
+  }
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
+
+#endif  // _M_IX86

 #ifdef __cplusplus
 }  // extern "C"