From 9245317e1687744b50f653d631bd808a00314041 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com>
Date: Wed, 4 Mar 2015 00:00:50 +0000
Subject: [PATCH] ARGBToRGB565 SSE2 port. BUG=407 TESTED=ARGBToRGB565Dither
 unittest R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/41039004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1308 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium             |  2 +-
 include/libyuv/row.h        |  7 +++
 include/libyuv/version.h    |  2 +-
 source/convert_from_argb.cc | 30 +++++++-----
 source/row_any.cc           | 16 +++++++
 source/row_win.cc           | 95 +++++++++++++++++++++++++++++++------
 6 files changed, 124 insertions(+), 28 deletions(-)

diff --git a/README.chromium b/README.chromium
index e16953bea..ccf25feea 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1306
+Version: 1307
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 80e844bae..8c6feda3a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -198,6 +198,7 @@ extern "C" {
 #define HAS_I422TORGB565ROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
+#define HAS_ARGBTORGB565DITHERROW_SSE2
 #endif
 
 // The following are available on all x86 platforms, but
@@ -905,6 +906,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint8* dither8x8, int pix);
+
 void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -1375,6 +1379,9 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                    const uint8* dither8x8, int pix);
+
 void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2c996980c..04624d0a1 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1306
+#define LIBYUV_VERSION 1307
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index dc2186a6a..ce5d97e1c 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -804,15 +804,16 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-static const uint8 kDither8x8[64] = {
-  0, 128, 32, 160,  8, 136, 40, 168,
-  192, 64, 224, 96, 200, 72, 232, 104,
-  48, 176, 16, 144, 56, 184, 24, 152,
-  240, 112, 208, 80, 248, 120, 216, 88,
-  12, 140, 44, 172,  4, 132, 36, 164,
-  204, 76, 236, 108, 196, 68, 228, 100,
-  60, 188, 28, 156, 52, 180, 20, 148,
-  252, 124, 220, 92, 244, 116, 212, 84,
+// Ordered 8x8 dither for 888 to 565.  8 bit entries >> 5 give 0 to 7.
+static const uint8 kDither565_8x8[64] = {
+  0 >> 5, 128 >> 5, 32 >> 5, 160 >> 5,  8 >> 5, 136 >> 5, 40 >> 5, 168 >> 5,
+  192 >> 5, 64 >> 5, 224 >> 5, 96 >> 5, 200 >> 5, 72 >> 5, 232 >> 5, 104 >> 5,
+  48 >> 5, 176 >> 5, 16 >> 5, 144 >> 5, 56 >> 5, 184 >> 5, 24 >> 5, 152 >> 5,
+  240 >> 5, 112 >> 5, 208 >> 5, 80 >> 5, 248 >> 5, 120 >> 5, 216 >> 5, 88 >> 5,
+  12 >> 5, 140 >> 5, 44 >> 5, 172 >> 5,  4 >> 5, 132 >> 5, 36 >> 5, 164 >> 5,
+  204 >> 5, 76 >> 5, 236 >> 5, 108 >> 5, 196 >> 5, 68 >> 5, 228 >> 5, 100 >> 5,
+  60 >> 5, 188 >> 5, 28 >> 5, 156 >> 5, 52 >> 5, 180 >> 5, 20 >> 5, 148 >> 5,
+  252 >> 5, 124 >> 5, 220 >> 5, 92 >> 5, 244 >> 5, 116 >> 5, 212 >> 5, 84 >> 5,
 };
 
 // Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
@@ -832,9 +833,16 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
     src_stride_argb = -src_stride_argb;
   }
   if (!dither8x8) {
-    dither8x8 = kDither8x8;
-
+    dither8x8 = kDither565_8x8;
   }
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     ARGBToRGB565DitherRow(src_argb, dst_rgb565,
                           dither8x8 + ((y & 7) << 3), width);
diff --git a/source/row_any.cc b/source/row_any.cc
index 19340b3b7..631b09a4c 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -225,6 +225,22 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7)
 #endif
 #undef RGBANY
 
+#define RGBDANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK)         \
+    void NAMEANY(const uint8* src, uint8* dst,                                 \
+                 const uint8* dither8x8, int width) {                          \
+      int n = width & ~MASK;  /* width rounded down to whole SIMD steps */     \
+      if (n > 0) {                                                             \
+        ARGBTORGB_SIMD(src, dst, dither8x8, n);                                \
+      }  /* the C row converts the remaining width & MASK pixels */            \
+      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, dither8x8, width & MASK);     \
+    }
+
+#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
+RGBDANY(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
+        ARGBToRGB565DitherRow_C, 4, 2, 7)  // 4 src, 2 dst bytes/pixel; step 8
+#endif
+#undef RGBDANY
+
 // ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
 #define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK)        \
     void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) {   \
diff --git a/source/row_win.cc b/source/row_win.cc
index 5c06b6078..e6796e407 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -585,6 +585,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
+// 4 pixels
 __declspec(naked) __declspec(align(16))
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
@@ -622,6 +623,70 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
+// Adds dither to 8 ARGB pixels per loop and packs them to RGB565.
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
+                                const uint8* dither8, int pix) {
+  __asm {
+    mov       eax, [esp + 12]  // dither8
+    movq      xmm6, qword ptr [eax]      // fetch 8 dither values d0..d7
+    punpcklbw xmm6, xmm6       // d0 d0 d1 d1 ... d7 d7
+    movdqa    xmm7, xmm6
+    punpcklwd xmm6, xmm6       // d0 x4 .. d3 x4: dither for pixels 0 to 3
+    punpckhwd xmm7, xmm7       // d4 x4 .. d7 x4: dither for pixels 4 to 7
+
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 16]  // pix
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6    // add dither to every byte, saturating at 255
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R: shift out alpha, R now in the top byte
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 5       // G
+    psrad     xmm0, 16      // R: arithmetic shift sign extends for packssdw
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0    // 4 dwords to 4 words
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+
+    movdqu    xmm0, [eax + 16]   // fetch next 4 pixels of argb
+    paddusb   xmm0, xmm7    // add dither to every byte, saturating at 255
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R: shift out alpha, R now in the top byte
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 5       // G
+    psrad     xmm0, 16      // R: arithmetic shift sign extends for packssdw
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0    // 4 dwords to 4 words
+    movq      qword ptr [edx + 8], xmm0  // store 4 pixels of RGB565
+
+    lea       eax, [eax + 32]  // advance source by 8 ARGB pixels
+    lea       edx, [edx + 16]  // advance destination by 8 RGB565 pixels
+    sub       ecx, 8           // visible callers pass pix as a multiple of 8
+    jg        convertloop
+    ret
+  }
+}
+
 // TODO(fbarchard): Improve sign extension/packing.
 __declspec(naked) __declspec(align(16))
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
@@ -1646,8 +1711,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
     __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
     __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
-    __asm vmovdqu    [edx], ymm1                                               \
-    __asm vmovdqu    [edx + 32], ymm0                                          \
+    __asm vmovdqu    0[edx], ymm1                                              \
+    __asm vmovdqu    32[edx], ymm0                                             \
     __asm lea        edx,  [edx + 64]                                          \
   }
 
@@ -1959,8 +2024,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movdqa     xmm1, xmm0                                                \
     __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
     __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
-    __asm movdqu     [edx], xmm0                                               \
-    __asm movdqu     [edx + 16], xmm1                                          \
+    __asm movdqu     0[edx], xmm0                                              \
+    __asm movdqu     16[edx], xmm1                                             \
     __asm lea        edx,  [edx + 32]                                          \
   }
 
@@ -1973,8 +2038,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movdqa     xmm0, xmm5                                                \
     __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
     __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
-    __asm movdqu     [edx], xmm5                                               \
-    __asm movdqu     [edx + 16], xmm0                                          \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
     __asm lea        edx,  [edx + 32]                                          \
   }
 
@@ -1986,8 +2051,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movdqa     xmm1, xmm2                                                \
     __asm punpcklwd  xmm2, xmm0           /* RGBA first 4 pixels */            \
     __asm punpckhwd  xmm1, xmm0           /* RGBA next 4 pixels */             \
-    __asm movdqu     [edx], xmm2                                               \
-    __asm movdqu     [edx + 16], xmm1                                          \
+    __asm movdqu     0[edx], xmm2                                              \
+    __asm movdqu     16[edx], xmm1                                             \
     __asm lea        edx,  [edx + 32]                                          \
   }
 
@@ -2000,8 +2065,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movdqa     xmm0, xmm5                                                \
     __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
     __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
-    __asm movdqu     [edx], xmm5                                               \
-    __asm movdqu     [edx + 16], xmm0                                          \
+    __asm movdqu     0[edx], xmm5                                              \
+    __asm movdqu     16[edx], xmm0                                             \
     __asm lea        edx,  [edx + 32]                                          \
   }
 
@@ -2017,8 +2082,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
     __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
     __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr [edx], xmm0  /* First 8 bytes */                \
-    __asm movdqu     [edx + 8], xmm1      /* Last 16 bytes */                  \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
     __asm lea        edx,  [edx + 24]                                          \
   }
 
@@ -2034,8 +2099,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
     __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
     __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr [edx], xmm0  /* First 8 bytes */                \
-    __asm movdqu     [edx + 8], xmm1      /* Last 16 bytes */                  \
+    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
     __asm lea        edx,  [edx + 24]                                          \
   }
 
@@ -2071,7 +2136,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm por        xmm3, xmm2    /* BG */                                    \
     __asm por        xmm1, xmm3    /* BGR */                                   \
     __asm packssdw   xmm0, xmm1                                                \
-    __asm movdqu     [edx], xmm0   /* store 8 pixels of RGB565 */              \
+    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
     __asm lea        edx, [edx + 16]                                           \
   }