From 0e6ce93c84f710e6a589c6c6edfe480ad0567f0c Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Fri, 20 Jan 2012 00:42:00 +0000
Subject: [PATCH] address munge for rowreverse.  And computer green mask based
 on red mask to save one shift. BUG=none TEST=none Review URL:
 https://webrtc-codereview.appspot.com/363001

git-svn-id: http://libyuv.googlecode.com/svn/trunk@144 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium     |   2 +-
 source/row_posix.cc |  14 ++---
 source/row_win.cc   | 131 +++-----------------------------------------
 3 files changed, 14 insertions(+), 133 deletions(-)

diff --git a/README.chromium b/README.chromium
index d41c158e6..7e25f590c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 143
+Version: 144
 License: BSD
 License File: LICENSE
 
diff --git a/source/row_posix.cc b/source/row_posix.cc
index d57260b2a..984281be3 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -655,14 +655,13 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
   "movdqa     %3,%%xmm5                        \n"
-  "lea        -0x10(%0,%2,1),%0                \n"
+  "lea        -0x10(%0),%0                     \n"
   "1:                                          \n"
-    "movdqa     (%0),%%xmm0                    \n"
-    "lea        -0x10(%0),%0                   \n"
+    "movdqa     (%0,%2),%%xmm0                 \n"
     "pshufb     %%xmm5,%%xmm0                  \n"
+    "sub        $0x10,%2                       \n"
     "movdqa     %%xmm0,(%1)                    \n"
     "lea        0x10(%1),%1                    \n"
-    "sub        $0x10,%2                       \n"
     "ja         1b                             \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
@@ -681,10 +680,9 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
 void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
-  "lea        -0x10(%0,%2,1),%0                \n"
+  "lea        -0x10(%0),%0                     \n"
   "1:                                          \n"
-    "movdqa     (%0),%%xmm0                    \n"
-    "lea        -0x10(%0),%0                   \n"
+    "movdqa     (%0,%2),%%xmm0                 \n"
     "movdqa     %%xmm0,%%xmm1                  \n"
     "psllw      $0x8,%%xmm0                    \n"
     "psrlw      $0x8,%%xmm1                    \n"
@@ -692,9 +690,9 @@ void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
     "pshuflw    $0x1b,%%xmm0,%%xmm0            \n"
     "pshufhw    $0x1b,%%xmm0,%%xmm0            \n"
     "pshufd     $0x4e,%%xmm0,%%xmm0            \n"
+    "sub        $0x10,%2                       \n"
     "movdqa     %%xmm0,(%1)                    \n"
     "lea        0x10(%1),%1                    \n"
-    "sub        $0x10,%2                       \n"
     "ja         1b                             \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
diff --git a/source/row_win.cc b/source/row_win.cc
index f47ea4b83..90260ef11 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -238,120 +238,6 @@ __asm {
   }
 }
 
-#ifdef SHIFT565
-// Below shift/mask code is efficient and works, but more instructions than
-// pmul method
-// TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc
-// 29 instructions
-__declspec(naked)
-void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                             int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_rgb565
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000 for Alpha
-    pslld     xmm5, 24
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
-    psllw     xmm4, 11
-    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
-    psrlw     xmm6, 11
-    pcmpeqb   xmm7, xmm7       // generate mask 0x00fc00fc for Green
-    psrlw     xmm7, 10
-    psllw     xmm7, 2
-
- convertloop:
-    movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
-    lea       eax, [eax + 16]
-    movdqa    xmm1, xmm0
-    movdqa    xmm2, xmm0
-    pand      xmm1, xmm4    // R in upper 5 bits
-    psrlw     xmm2, 13      // R 3 bits
-    psllw     xmm2, 8
-    por       xmm1, xmm2
-    movdqa    xmm2, xmm0
-    pand      xmm2, xmm6    // mask B 5 bits
-    movdqa    xmm3, xmm2
-    psllw     xmm2, 3
-    psrlw     xmm3, 2
-    por       xmm2, xmm3
-    por       xmm1, xmm2    // RB
-    psrlw     xmm0, 3       // G in top 6 bits of lower byte
-    pand      xmm0, xmm7    // mask G 6 bits
-    movdqa    xmm2, xmm0
-    psrlw     xmm2, 6
-    por       xmm0, xmm2
-    por       xmm0, xmm5   // AG
-    movdqa    xmm2, xmm1
-    punpcklbw xmm1, xmm0
-    punpckhbw xmm2, xmm0
-    movdqa    [edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [edx + 16], xmm2  // store next 4 pixels of ARGB
-    lea       edx, [edx + 32]
-    sub       ecx, 8
-    ja        convertloop
-    ret
-  }
-}
-
-// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
-// 33 instructions
-__declspec(naked)
-void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                               int pix) {
-__asm {
-    mov       eax, [esp + 4]   // src_argb1555
-    mov       edx, [esp + 8]   // dst_argb
-    mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff00ff00 for Alpha
-    psllw     xmm5, 8
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
-    psllw     xmm4, 11
-    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
-    psrlw     xmm6, 11
-    pcmpeqb   xmm7, xmm7       // generate mask 0x00f800f8 for Green
-    psrlw     xmm7, 11
-    psllw     xmm7, 3
-
- convertloop:
-    movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
-    lea       eax, [eax + 16]
-    movdqa    xmm1, xmm0
-    psllw     xmm1, 1
-    movdqa    xmm2, xmm0
-    pand      xmm1, xmm4    // R in upper 5 bits
-    psrlw     xmm2, 13      // R 3 bits
-    psllw     xmm2, 8
-    por       xmm1, xmm2
-    movdqa    xmm2, xmm0
-    pand      xmm2, xmm6    // mask B 5 bits
-    movdqa    xmm3, xmm2
-    psllw     xmm2, 3
-    psrlw     xmm3, 2
-    por       xmm2, xmm3
-    por       xmm1, xmm2    // RB
-    movdqa    xmm2, xmm0
-    psrlw     xmm2, 2       // G in top 5 bits of lower byte
-    pand      xmm2, xmm7    // mask G 5 bits
-    movdqa    xmm3, xmm2
-    psrlw     xmm3, 5
-    por       xmm2, xmm3
-    psraw     xmm0, 8       // A
-    pand      xmm0, xmm5
-    por       xmm0, xmm2    // AG
-    movdqa    xmm2, xmm1
-    punpcklbw xmm1, xmm0
-    punpckhbw xmm2, xmm0
-    movdqa    [edx], xmm1  // store 4 pixels of ARGB
-    movdqa    [edx + 16], xmm2  // store next 4 pixels of ARGB
-    lea       edx, [edx + 32]
-    sub       ecx, 8
-    ja        convertloop
-    ret
-  }
-}
-#endif
-
 // pmul method to replicate bits
 // Math to replicate bits
 // (v << 8) | (v << 3)
@@ -422,8 +308,7 @@ __asm {
     pshufd    xmm6, xmm6, 0
     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    pcmpeqb   xmm4, xmm4       // generate mask 0x03e003e0 for Green
-    psllw     xmm4, 11
+    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
     psrlw     xmm4, 6
     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
@@ -1305,14 +1190,13 @@ __asm {
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
     movdqa    xmm5, kShuffleReverse
-    lea       eax, [eax + ecx - 16]
- convertloop:
-    movdqa    xmm0, [eax]
     lea       eax, [eax - 16]
+ convertloop:
+    movdqa    xmm0, [eax + ecx]
     pshufb    xmm0, xmm5
+    sub       ecx, 16
     movdqa    [edx], xmm0
     lea       edx, [edx + 16]
-    sub       ecx, 16
     ja        convertloop
     ret
   }
@@ -1327,10 +1211,9 @@ __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    lea       eax, [eax + ecx - 16]
- convertloop:
-    movdqa    xmm0, [eax]
     lea       eax, [eax - 16]
+ convertloop:
+    movdqa    xmm0, [eax + ecx]
     movdqa    xmm1, xmm0        // swap bytes
     psllw     xmm0, 8
     psrlw     xmm1, 8
@@ -1338,9 +1221,9 @@ __asm {
     pshuflw   xmm0, xmm0, 0x1b  // swap words
     pshufhw   xmm0, xmm0, 0x1b
     pshufd    xmm0, xmm0, 0x4e  // swap qwords
+    sub       ecx, 16
     movdqa    [edx], xmm0
     lea       edx, [edx + 16]
-    sub       ecx, 16
     ja        convertloop
     ret
   }