Add ARGBAffineRow_SSE2, a function that copies ARGB pixels sampled along a sloped line in the source image into one row of the destination.

BUG=60 TEST=none Review URL: https://webrtc-codereview.appspot.com/727004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@313 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2012-08-08 18:10:15 +00:00 · 2012-08-08 18:10:15 +00:00 · 864f828a01
commit 864f828a01
parent 4f10e97ff2
4 changed files with 76 additions and 3 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 312
+Version: 313
 License: BSD
 License File: LICENSE
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 312
+#define LIBYUV_VERSION 313
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row.h
+++ b/source/row.h
@ -87,6 +87,7 @@ extern "C" {
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
 #define HAS_ARGBAFFINEROW_SSE2
 #endif
 // The following are disabled when SSSE3 is available:
@ -522,6 +523,8 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                    uint32 value);
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value);
 // Copy ARGB pixels from a source image along a sloped line into one
 // destination row.  uv_dudv points to 4 floats: the starting x, y followed
 // by the per pixel step dx, dy.  width is the number of pixels to write.
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width);
 #ifdef __cplusplus
 }  // extern "C"
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -3347,8 +3347,78 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 }
 #endif  // HAS_ARGBSHADE_SSE2
-#endif  // _M_IX86
+#ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from a source image along a sloped line into one row of
 // the destination.
 //
 //   src_argb         base pointer of the source image.
 //   src_argb_stride  byte distance between source rows.  Only the low 16 bits
 //                    are used, as a signed multiplier for pmaddwd.
 //   dst_argb         destination row; 4 bytes are written per pixel.
 //   uv_dudv          4 floats: starting x, y followed by the per pixel step
 //                    dx, dy.
 //   width            number of destination pixels; 0 or negative writes
 //                    nothing.
 //
 // For each pixel the coordinates are truncated to integers, saturated to
 // signed 16 bit, and the source byte offset is x * 4 + y * stride.  No bounds
 // checking is done on that offset.
 //
 // Register use:
 //   eax  src_argb            edx  dst_argb (advances)
 //   ecx  remaining width     esi  stride, then scratch source offset
 //   xmm2 current x, y (2 pixel loop: x0, y0, x1, y1)
 //   xmm3 dx, dy (2 pixel loop: 2 * dx, 2 * dy in both halves)
 //   xmm4 words 4, stride (duplicated for the 2 pixel loop)
 __declspec(naked) __declspec(align(16))
 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                         uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push       esi              // preserved for the caller; args shift by 4
    mov        eax, [esp + 8]   // src_argb
    mov        esi, [esp + 12]  // stride
    mov        edx, [esp + 16]  // dst_argb
    mov        ecx, [esp + 20]  // pointer to uv_dudv
    movq       xmm2, qword ptr [ecx]  // uv; upper half zeroed
    movq       xmm3, qword ptr [ecx + 8]  // dudv; upper half zeroed
    mov        ecx, [esp + 24]  // width
    shl        esi, 16          // low word 4, high word stride
    add        esi, 4
    movd       xmm4, esi
    sub        ecx, 2
    jl         l2b              // fewer than 2 pixels: skip the 2 pixel loop

    // setup for 2 pixel loop
    movdqa     xmm0, xmm2
    addps      xmm0, xmm3    // x1, y1 = x0 + dx, y0 + dy
    movlhps    xmm2, xmm0    // x0, y0, x1, y1
    pshufd     xmm4, xmm4, 0  // dup 4, stride
    movlhps    xmm3, xmm3    // dup dudv
    addps      xmm3, xmm3    // dudv *= 2

    // 2 pixel loop
    align      4
  l2:
    cvttps2dq  xmm1, xmm2    // x, y float to int
    packssdw   xmm1, xmm1    // x, y as shorts
    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride
    addps      xmm2, xmm3    // x, y += 2 * dx, 2 * dy for both pixels
    movd       esi, xmm1     // offset of pixel 0
    movdqa     xmm5, xmm1
    pshufd     xmm5, xmm5, 0x55  // bring offset of pixel 1 to low dword
    movd       xmm0, [eax + esi]  // read pixel 0
    movd       esi, xmm5
    movd       xmm5, [eax + esi]  // read pixel 1
    punpckldq  xmm0, xmm5    // pixel 0, pixel 1
    sub        ecx, 2
    movq       qword ptr [edx], xmm0  // movq and lea leave the flags alone
    lea        edx, [edx + 8]
    jge        l2            // flags from sub ecx, 2

  l2b:
    add        ecx, 2 - 1
    jl         l1b

    // 1 pixel loop
    // At most one pixel remains here, so it does not matter that xmm3 holds
    // the doubled step after the 2 pixel loop: the advanced coordinates are
    // never used.  Only the low dword of xmm4 and of the pmaddwd result is
    // needed, so xmm4 does not have to be duplicated on the direct path.
    align      4
  l1:
    cvttps2dq  xmm1, xmm2    // x, y float to int
    packssdw   xmm1, xmm1    // x, y as shorts
    pmaddwd    xmm1, xmm4    // offset = x * 4 + y * stride
    addps      xmm2, xmm3    // x, y += dx, dy
    movd       esi, xmm1
    movd       xmm0, [eax + esi]  // copy a pixel
    sub        ecx, 1
    movd       [edx], xmm0
    lea        edx, [edx + 4]
    jge        l1
  l1b:
    pop        esi
    ret
  }
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
 #endif  // _M_IX86
 #ifdef __cplusplus
 }  // extern "C"