From ccd6d9b2de6af7985775a2e5537190cf5794dd44 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Fri, 13 Jan 2012 19:26:50 +0000
Subject: [PATCH] ARGB1555ToARGBRow_SSE2 BUG=none TEST=media_unittest Review
 URL: http://webrtc-codereview.appspot.com/349006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@133 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium      |   2 +-
 source/row.h         |  80 ++++-------------
 source/row_common.cc |   9 ++
 source/row_win.cc    | 204 +++++++++++++++++++++++++++++++++++++++----
 4 files changed, 213 insertions(+), 82 deletions(-)

diff --git a/README.chromium b/README.chromium
index 24359ae46..7e1df26af 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 132
+Version: 133
 License: BSD
 License File: LICENSE
 
diff --git a/source/row.h b/source/row.h
index 192ab5680..0cbd7f0a7 100644
--- a/source/row.h
+++ b/source/row.h
@@ -60,8 +60,9 @@
 
 // The following are available on Windows platforms
 #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
-#define HAS_ARGB4444TOARGBROW_SSE2
 #define HAS_RGB565TOARGBROW_SSE2
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
 #endif
 
 // The following are available on Neon platforms
@@ -82,64 +83,60 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_NEON
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) signed char vec8[16];
+typedef __declspec(align(16)) unsigned char uvec8[16];
+typedef __declspec(align(16)) signed short vec16[8];
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef signed char __attribute__((vector_size(16))) vec8;
+typedef unsigned char __attribute__((vector_size(16))) uvec8;
+typedef signed short __attribute__((vector_size(16))) vec16;
+#endif
+
+
 void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* rgb_buf,
                                   int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOBGRAROW_NEON
 void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* rgb_buf,
                                   int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOABGRROW_NEON
 void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* rgb_buf,
                                   int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTORGB565ROW_NEON
 void FastConvertYUVToRGB565Row_NEON(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     uint8* rgb_buf,
                                     int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOARGB1555ROW_NEON
 void FastConvertYUVToARGB1555Row_NEON(const uint8* y_buf,
                                       const uint8* u_buf,
                                       const uint8* v_buf,
                                       uint8* rgb_buf,
                                       int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTOARGB4444ROW_NEON
 void FastConvertYUVToARGB4444Row_NEON(const uint8* y_buf,
                                       const uint8* u_buf,
                                       const uint8* v_buf,
                                       uint8* rgb_buf,
                                       int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTORGB24ROW_NEON
 void FastConvertYUVToRGB24Row_NEON(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width);
-#endif
-#ifdef HAS_FASTCONVERTYUVTORAWROW_NEON
 void FastConvertYUVToRAWRow_NEON(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
                                  uint8* rgb_buf,
                                  int width);
-#endif
 
-#ifdef HAS_ARGBTOYROW_SSSE3
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -149,11 +146,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
-#endif
-#if defined(HAS_RGB24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
-#define HASRGB24TOYROW_SSSE3
-#endif
-#ifdef HASRGB24TOYROW_SSSE3
 void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -171,16 +163,9 @@ void ARGB1555ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 void ARGB4444ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 
-#endif
-#ifdef HAS_REVERSE_ROW_SSSE3
 void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_SSE2
 void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_NEON
 void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
-#endif
 void ReverseRow_C(const uint8* src, uint8* dst, int width);
 
 void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
@@ -209,20 +194,14 @@ void ARGB1555ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
 void ARGB4444ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
 
-#ifdef HAS_RGB24TOARGBROW_SSSE3
 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
 void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
-// TODO(fbarchard): SSE2 555
-//void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-#endif
-#ifdef HAS_RGB565TOARGBROW_SSE2
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-#endif
-#ifdef HAS_ARGB4444TOARGBROW_SSE2
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
-#endif
+
 void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
 void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
 void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
@@ -231,27 +210,9 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
 void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
 
-#ifdef HAS_I400TOARGBROW_SSE2
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-#endif
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
 
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-typedef __declspec(align(16)) signed char vec8[16];
-typedef __declspec(align(16)) unsigned char uvec8[16];
-typedef __declspec(align(16)) signed short vec16[8];
-#else // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-typedef signed char __attribute__((vector_size(16))) vec8;
-typedef unsigned char __attribute__((vector_size(16))) uvec8;
-typedef signed short __attribute__((vector_size(16))) vec16;
-#endif
-
-extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-
 void FastConvertYUVToARGBRow_C(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
@@ -310,7 +271,6 @@ void FastConvertYToARGBRow_C(const uint8* y_buf,
                              uint8* rgb_buf,
                              int width);
 
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
 void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
@@ -344,9 +304,7 @@ void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,
                                 int width);
-#endif
 
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
@@ -400,15 +358,11 @@ void FastConvertYUVToRAWRow_SSSE3(const uint8* y_buf,
                                   const uint8* v_buf,
                                   uint8* rgb_buf,
                                   int width);
-#endif
 
-#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,
                                 int width);
 
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_common.cc b/source/row_common.cc
index 6a02a4b8a..4c52bef43 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -328,7 +328,11 @@ void RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 
 void ARGB1555ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   SIMD_ALIGNED(uint8 row[kMaxStride]);
+#ifdef HAS_ARGB1555TOARGBROW_SSE2
+  ARGB1555ToARGBRow_SSE2(src_argb, row, pix);
+#else
   ARGB1555ToARGBRow_C(src_argb, row, pix);
+#endif
   ARGBToYRow_SSSE3(row, dst_y, pix);
 }
 
@@ -378,8 +382,13 @@ void RGB565ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
 void ARGB1555ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int pix) {
   SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+#ifdef HAS_ARGB1555TOARGBROW_SSE2
+  ARGB1555ToARGBRow_SSE2(src_argb, row, pix);
+  ARGB1555ToARGBRow_SSE2(src_argb + src_stride_argb, row + kMaxStride, pix);
+#else
   ARGB1555ToARGBRow_C(src_argb, row, pix);
   ARGB1555ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+#endif
   ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
 }
 
diff --git a/source/row_win.cc b/source/row_win.cc
index e3325b959..27a9f593e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -229,53 +229,50 @@ __asm {
   }
 }
 
+#ifdef SHIFT565
+// Below shift/mask code is efficient and works, but more instructions than
+// pmul method
 // TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc
+// 29 instructions
 __declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int pix) {
+void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                             int pix) {
 __asm {
     mov       eax, [esp + 4]   // src_rgb565
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000 for Alpha
     pslld     xmm5, 24
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
     psllw     xmm4, 11
-    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f
+    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
     psrlw     xmm6, 11
-    pcmpeqb   xmm7, xmm7       // generate mask 0x00fc00fc
+    pcmpeqb   xmm7, xmm7       // generate mask 0x00fc00fc for Green
     psrlw     xmm7, 10
     psllw     xmm7, 2
 
-
  convertloop:
     movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
     lea       eax, [eax + 16]
-
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
     pand      xmm1, xmm4    // R in upper 5 bits
     psrlw     xmm2, 13      // R 3 bits
     psllw     xmm2, 8
     por       xmm1, xmm2
-
     movdqa    xmm2, xmm0
     pand      xmm2, xmm6    // mask B 5 bits
     movdqa    xmm3, xmm2
     psllw     xmm2, 3
     psrlw     xmm3, 2
     por       xmm2, xmm3
-
     por       xmm1, xmm2    // RB
-
     psrlw     xmm0, 3       // G in top 6 bits of lower byte
     pand      xmm0, xmm7    // mask G 6 bits
     movdqa    xmm2, xmm0
     psrlw     xmm2, 6
     por       xmm0, xmm2
-
     por       xmm0, xmm5   // AG
-
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
@@ -288,7 +285,177 @@ __asm {
   }
 }
 
+// Shift/mask ARGB1555 to ARGB, 8 pixels per loop. TODO(fbarchard): Port to gcc
+// 33 instructions
+__declspec(naked)
+void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                               int pix) {
+__asm {
+    mov       eax, [esp + 4]   // src_argb1555 (movdqa: must be 16 byte aligned)
+    mov       edx, [esp + 8]   // dst_argb (movdqa: must be 16 byte aligned)
+    mov       ecx, [esp + 12]  // pix, consumed 8 at a time
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm5, 8
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf800f800 for Red
+    psllw     xmm4, 11
+    pcmpeqb   xmm6, xmm6       // generate mask 0x001f001f for Blue
+    psrlw     xmm6, 11
+    pcmpeqb   xmm7, xmm7       // generate mask 0x00f800f8 for Green
+    psrlw     xmm7, 11
+    psllw     xmm7, 3
+
+ convertloop:
+    movdqa    xmm0, [eax] // fetch 8 pixels of argb1555
+    lea       eax, [eax + 16]
+    movdqa    xmm1, xmm0
+    psllw     xmm1, 1       // shift A out so R is the upper 5 bits
+    movdqa    xmm2, xmm1    // copy after the shift so A is not mixed into R
+    pand      xmm1, xmm4    // R in upper 5 bits
+    psrlw     xmm2, 13      // top 3 bits of R
+    psllw     xmm2, 8       // place them below the 5 bit R
+    por       xmm1, xmm2
+    movdqa    xmm2, xmm0
+    pand      xmm2, xmm6    // mask B 5 bits
+    movdqa    xmm3, xmm2
+    psllw     xmm2, 3
+    psrlw     xmm3, 2
+    por       xmm2, xmm3
+    por       xmm1, xmm2    // RB
+    movdqa    xmm2, xmm0
+    psrlw     xmm2, 2       // G in top 5 bits of lower byte
+    pand      xmm2, xmm7    // mask G 5 bits
+    movdqa    xmm3, xmm2
+    psrlw     xmm3, 5       // top 3 bits of G into the low 3 bits
+    por       xmm2, xmm3
+    psraw     xmm0, 8       // A: arithmetic shift smears the A bit over the byte
+    pand      xmm0, xmm5
+    por       xmm0, xmm2    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG to form B,G,R,A bytes
+    punpckhbw xmm2, xmm0
+    movdqa    [edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       edx, [edx + 32]
+    sub       ecx, 8
+    ja        convertloop
+    ret
+  }
+}
+#endif
+
+// pmul method to replicate bits: converts 8 RGB565 pixels to ARGB per loop.
+// A 5 bit value v held in the top of a word expands to 8 bits as
+// (v << 8) | (v << 3) = v * (256 + 8), keeping the high word of the product
+// (pmulhuw by 0x0108).
+// The 6 bit G field is masked in place, 5 bits below the top, and repeats
+// after 6 bits, so its multiplier is (256 + 4) << 5 = 0x2080.
+// 20 instructions
+__declspec(naked)
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+__asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    psllw     xmm4, 10
+    psrlw     xmm4, 5
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_rgb565 (movdqa: must be 16 byte aligned)
+    mov       edx, [esp + 8]   // dst_argb (movdqa: must be 16 byte aligned)
+    mov       ecx, [esp + 12]  // pix, consumed 8 at a time
+    sub       edx, eax         // edx = dst - 2 * src, so [eax * 2 + edx]
+    sub       edx, eax         // tracks dst while only eax is advanced
+
+ convertloop:
+    movdqa    xmm0, [eax] // fetch 8 pixels of bgr565
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    pand      xmm1, xmm3    // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    psllw     xmm1, 8       // R to the high byte, B stays in the low byte
+    por       xmm1, xmm2    // RB
+    pand      xmm0, xmm4    // G in middle 6 bits
+    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
+    por       xmm0, xmm7    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG to form B,G,R,A bytes
+    punpckhbw xmm2, xmm0
+    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    ja        convertloop
+    ret
+  }
+}
+
+// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc
+// 24 instructions; converts 8 pixels of ARGB1555 to ARGB per loop iteration
+__declspec(naked)
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+__asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x42004200  // (256 + 8) << 6: G is masked in place 6 bits down
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    pcmpeqb   xmm4, xmm4       // generate mask 0x03e003e0 for Green
+    psllw     xmm4, 11
+    psrlw     xmm4, 6
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_argb1555 (movdqa: must be 16 byte aligned)
+    mov       edx, [esp + 8]   // dst_argb (movdqa: must be 16 byte aligned)
+    mov       ecx, [esp + 12]  // pix, consumed 8 at a time
+    sub       edx, eax         // edx = dst - 2 * src, so [eax * 2 + edx]
+    sub       edx, eax         // tracks dst while only eax is advanced
+
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    psllw     xmm1, 1       // shift A out so R is in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pand      xmm1, xmm3    // keep only the 5 R bits
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    psllw     xmm1, 8       // R to the high byte, B stays in the low byte
+    por       xmm1, xmm2    // RB
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // G in middle 5 bits
+    psraw     xmm2, 8       // A: arithmetic shift smears the A bit over the byte
+    pmulhuw   xmm0, xmm6    // * ((256 + 8) << 6): 8 bit G in the low byte
+    pand      xmm2, xmm7    // keep A in the high byte only
+    por       xmm0, xmm2    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0    // interleave RB with AG to form B,G,R,A bytes
+    punpckhbw xmm2, xmm0
+    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    ja        convertloop
+    ret
+  }
+}
+
 // TODO(fbarchard): Port ARGB4444ToARGBRow_SSE2 to gcc
+// 18 instructions
 __declspec(naked)
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix) {
@@ -301,10 +468,11 @@ __asm {
     mov       eax, [esp + 4]   // src_argb4444
     mov       edx, [esp + 8]   // dst_argb
     mov       ecx, [esp + 12]  // pix
+    sub       edx, eax
+    sub       edx, eax
 
  convertloop:
-    movdqa    xmm0, qword ptr [eax] // fetch 8 pixels of bgra4444
-    lea       eax, [eax + 16]
+    movdqa    xmm0, [eax]   // fetch 8 pixels of bgra4444
     movdqa    xmm2, xmm0
     pand      xmm0, xmm4    // mask low nibbles
     pand      xmm2, xmm5    // mask high nibbles
@@ -317,9 +485,9 @@ __asm {
     movdqa    xmm1, xmm0
     punpcklbw xmm0, xmm2
     punpckhbw xmm1, xmm2
-    movdqa    [edx], xmm0  // store 4 pixels of ARGB
-    movdqa    [edx + 16], xmm1  // store next 4 pixels of ARGB
-    lea       edx, [edx + 32]
+    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
     sub       ecx, 8
     ja        convertloop
     ret