Alpha blend: test alignment of the source pointers and use movdqa aligned fetches when both are 16-byte aligned.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/714010
git-svn-id: http://libyuv.googlecode.com/svn/trunk@321 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-07 17:26:49 +08:00 · 2012-08-15 00:51:24 +00:00 · 2012-08-15 00:51:24 +00:00 · f877e71995
commit f877e71995
parent e3cc76943e
4 changed files with 65 additions and 3 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 320
+Version: 321
 License: BSD
 License File: LICENSE
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 320
+#define LIBYUV_VERSION 321
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -2540,9 +2540,40 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%0                         \n"
    "jne       41f                             \n"
    "test      $0xf,%1                         \n"
    "jne       41f                             \n"
    // 4 pixel aligned loop.  Reached only when both source pointers (%0 and
    // %1) passed the 16 byte alignment tests, and each advances by 16 bytes
    // per iteration, so every source fetch can use movdqa.  The instruction
    // sequence mirrors convertloop4 in row_win.cc.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqa    (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqa    (%1),%%xmm1                     \n"
    "lea       0x10(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    // sub sets the flags that jge tests; the movdqa store and the lea in
    // between do not modify them.
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"
    // Skip over the unaligned 4 pixel loop (41) that follows.
    "jmp       49f                             \n"
    // 4 pixel unaligned loop.
    ".p2align  2                               \n"
  "41:                                         \n"
    "movdqu    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -2636,8 +2636,39 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    add        ecx, 1 - 4
    jl         convertloop4b
    test       eax, 15          // unaligned?
    jne        convertuloop4
    test       esi, 15          // unaligned?
    jne        convertuloop4
    // 4 pixel aligned loop.  Entered only when eax and esi both passed the
    // 16 byte alignment tests; each advances by 16 bytes per iteration, so
    // aligned movdqa loads are used for both sources.
  convertloop4:
    movdqa     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqa     xmm2, [esi]      // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqa     xmm1, [esi]      // _a_g
    lea        esi, [esi + 16]
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4           // sets the flags tested by jge below
    movdqa     [edx], xmm0      // aligned store of 4 blended pixels
    lea        edx, [edx + 16]  // lea leaves the flags from sub intact
    jge        convertloop4
    jmp        convertloop4b    // skip the unaligned loop that follows
    // 4 pixel unaligned loop.
  convertuloop4:
    movdqu     xmm3, [eax]      // src argb
    lea        eax, [eax + 16]
    movdqa     xmm0, xmm3       // src argb
@ -2659,7 +2690,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
-    jge        convertloop4
+    jge        convertuloop4
  convertloop4b:
    add        ecx, 4 - 1