From 00b69a2fe66183be5f72cb80c59f22e137b45359 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Fri, 2 Nov 2012 06:03:28 +0000
Subject: [PATCH] I400ToARGB_Neon optimized BUG=none TEST=none Review URL:
 https://webrtc-codereview.appspot.com/935010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@465 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |  2 +-
 include/libyuv/row.h     | 55 ++++++++++++--------------------------
 include/libyuv/version.h |  2 +-
 source/convert_argb.cc   | 20 ++++++++++----
 source/row_any.cc        |  5 ++++
 source/row_neon.cc       | 57 ++++++++++++++++++++++++++++++++++++++++
 source/row_posix.cc      | 31 ++++++++++++++++++++++
 source/row_win.cc        | 30 +++++++++++++++++++++
 8 files changed, 157 insertions(+), 45 deletions(-)

diff --git a/README.chromium b/README.chromium
index 96070ab5a..854e32785 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 464
+Version: 465
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index a7824ee16..4c2026583 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -144,6 +144,7 @@ extern "C" {
 #define HAS_ABGRTOARGBROW_NEON
 #define HAS_ARGBTOBAYERROW_NEON
 #define HAS_ARGBTORAWROW_NEON
+#define HAS_I400TOARGBROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGBAROW_NEON
 #define HAS_BGRATOARGBROW_NEON
@@ -450,31 +451,31 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 
 void I444ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
                      uint8* argb_buf,
                      int width);
-
 void I422ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
                      uint8* argb_buf,
                      int width);
-
 void I411ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
                      uint8* rgb_buf,
                      int width);
-
 void NV12ToARGBRow_C(const uint8* y_buf,
                      const uint8* uv_buf,
                      uint8* argb_buf,
                      int width);
-
 void NV21ToRGB565Row_C(const uint8* y_buf,
                        const uint8* vu_buf,
                        uint8* argb_buf,
@@ -483,24 +484,20 @@ void NV12ToRGB565Row_C(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* argb_buf,
                        int width);
-
 void NV21ToARGBRow_C(const uint8* y_buf,
                      const uint8* vu_buf,
                      uint8* argb_buf,
                      int width);
-
 void I422ToBGRARow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
                      uint8* bgra_buf,
                      int width);
-
 void I422ToABGRRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
                      uint8* abgr_buf,
                      int width);
-
 void I422ToRGBARow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
@@ -531,7 +528,6 @@ void I422ToRGB565Row_C(const uint8* y_buf,
                        const uint8* v_buf,
                        uint8* dst_rgb565,
                        int width);
-
 void YToARGBRow_C(const uint8* y_buf,
                   uint8* rgb_buf,
                   int width);
@@ -541,51 +537,42 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* v_buf,
                          uint8* argb_buf,
                          int width);
-
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* argb_buf,
                          int width);
-
 void I411ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* rgb_buf,
                          int width);
-
 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* uv_buf,
                          uint8* argb_buf,
                          int width);
-
 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* vu_buf,
                          uint8* argb_buf,
                          int width);
-
 void NV12ToRGB565Row_SSSE3(const uint8* y_buf,
                            const uint8* uv_buf,
                            uint8* argb_buf,
                            int width);
-
 void NV21ToRGB565Row_SSSE3(const uint8* y_buf,
                            const uint8* vu_buf,
                            uint8* argb_buf,
                            int width);
-
 void I422ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* bgra_buf,
                          int width);
-
 void I422ToABGRRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* abgr_buf,
                          int width);
-
 void I422ToRGBARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -606,14 +593,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                            const uint8* v_buf,
                            uint8* rgb_buf,
                            int width);
-
 // RGB24/RAW are unaligned.
 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* rgb_buf,
                           int width);
-
 void I422ToRAWRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -719,20 +704,17 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* y_buf,
                                const uint8* v_buf,
                                uint8* rgba_buf,
                                int width);
-
 // RGB24/RAW are unaligned.
 void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width);
-
 void I422ToRAWRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
-
 void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* argb_buf,
                      int width);
@@ -847,24 +829,21 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width);
-
 void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
-                            int width);
-
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
 void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
-                            int width);
-
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
 void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
-                            int width);
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width);
 
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 74566f814..cd1e14ce0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 464
+#define LIBYUV_VERSION 465
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 14ab96d70..cab63d8ff 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -248,13 +248,23 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
   void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
       I400ToARGBRow_C;
 #if defined(HAS_I400TOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) &&
-      IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) &&
-      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    I400ToARGBRow = I400ToARGBRow_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+    I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        I400ToARGBRow = I400ToARGBRow_SSE2;
+      }
+    }
+  }
+#elif defined(HAS_I400TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    I400ToARGBRow = I400ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I400ToARGBRow = I400ToARGBRow_NEON;
+    }
   }
 #endif
-
   for (int y = 0; y < height; ++y) {
     I400ToARGBRow(src_y, dst_argb, width);
     src_y += src_stride_y;
diff --git a/source/row_any.cc b/source/row_any.cc
index 8a06202f4..bc5ea964d 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -116,6 +116,7 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
 // SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination.
 // SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes.
 // NEON RGB24 is multiple of 8 pixels, unaligned source and destination.
+// I400 To ARGB does multiple of 8 pixels with SIMD and remainder with C.
 #define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)          \
     void NAMEANY(const uint8* argb_buf,                                        \
                  uint8* rgb_buf,                                               \
@@ -136,6 +137,8 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
        3, 4, 2)
 RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
        3, 4, 2)
+RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
+       7, 1, 4)
 #endif
 #if defined(HAS_ARGBTORGB24ROW_NEON)
 RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
@@ -146,6 +149,8 @@ RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
        7, 4, 2)
 RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
        7, 4, 2)
+RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
+       7, 1, 4)
 #endif
 #undef RGBANY
 
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 52783dcdb..f84d7ba47 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -24,6 +24,11 @@ extern "C" {
     "vld1.u32   {d2[0]}, [%1]!                 \n"                             \
     "vld1.u32   {d2[1]}, [%2]!                 \n"
 
+// Read 8 Y into d0 and set d2 (4 U and 4 V bytes) to 128; advances %0 by 8.
+#define READYUV400                                                             \
+    "vld1.u8    {d0}, [%0]!                    \n"                             \
+    "vmov.u8    d2, #128                       \n"
+
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
     "vld1.u8    {d0}, [%0]!                    \n"                             \
@@ -411,6 +416,58 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
 }
 #endif  // HAS_I422TOARGB4444ROW_NEON
 
+#ifdef HAS_YTOARGBROW_NEON  // NOTE(review): not #defined by this patch; confirm
+void YToARGBRow_NEON(const uint8* src_y,
+                     uint8* dst_argb,
+                     int width) {
+  asm volatile (  // Converts 8 Y bytes to 8 four-byte pixels per iteration.
+    "vld1.u8    {d24}, [%3]                    \n"  // d24 = 8 bytes, kUVToRB
+    "vld1.u8    {d25}, [%4]                    \n"  // d25 = 8 bytes, kUVToG
+    "vmov.u8    d26, #128                      \n"  // d26 = 128 in each byte
+    "vmov.u16   q14, #74                       \n"  // q14 = 74 in each u16
+    "vmov.u16   q15, #16                       \n"  // q15 = 16 in each u16
+    ".p2align  2                               \n"  // align the loop label
+  "1:                                          \n"
+    READYUV400  // d0 = 8 Y bytes, d2 = 128 in each byte, src_y += 8
+    YUV422TORGB  // NOTE(review): presumably yields d20-d22; macro not shown
+    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+    "vmov.u8    d23, #255                      \n"  // 4th byte of each pixel
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 px, dst += 32
+    "bgt        1b                             \n"  // until width <= 0
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "r"(&kUVToRB),   // %3
+      "r"(&kUVToG)     // %4
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_YTOARGBROW_NEON
+
+#ifdef HAS_I400TOARGBROW_NEON
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (  // Each Y byte becomes 4 output bytes: Y, Y, Y, 255.
+    "vmov.u8    d23, #255                      \n"  // 4th byte of each pixel
+    ".p2align  2                               \n"  // align the loop label
+  "1:                                          \n"
+    "vld1.u8    {d20}, [%0]!                   \n"  // load 8 Y, src_y += 8
+    "vmov       d21, d20                       \n"  // copy Y to 2nd byte
+    "vmov       d22, d20                       \n"  // copy Y to 3rd byte
+    "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 px, dst += 32
+    "bgt        1b                             \n"  // until width <= 0
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "d20", "d21", "d22", "d23"
+  );
+}
+#endif  // HAS_I400TOARGBROW_NEON
+
 #ifdef HAS_NV12TOARGBROW_NEON
 void NV12ToARGBRow_NEON(const uint8* src_y,
                         const uint8* src_uv,
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 1078ed654..fa0c07ec6 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -171,6 +171,37 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   );
 }
 
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+                                  int pix) {
+  asm volatile (  // Each Y byte becomes one dword: Y, Y, Y, 0xff.
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
+    "pslld     $0x18,%%xmm5                    \n"  // mask 0xff000000
+    ".p2align  4                               \n"  // align the loop label
+  "1:                                          \n"
+    "movq      (%0),%%xmm0                     \n"  // load 8 Y bytes
+    "lea       0x8(%0),%0                      \n"  // src_y += 8
+    "punpcklbw %%xmm0,%%xmm0                   \n"  // Y -> YY, 8 words
+    "movdqa    %%xmm0,%%xmm1                   \n"  // keep copy for high
+    "punpcklwd %%xmm0,%%xmm0                   \n"  // YYYY for pixels 0-3
+    "punpckhwd %%xmm1,%%xmm1                   \n"  // YYYY for pixels 4-7
+    "por       %%xmm5,%%xmm0                   \n"  // top byte = 0xff
+    "por       %%xmm5,%%xmm1                   \n"  // top byte = 0xff
+    "movdqu    %%xmm0,(%1)                     \n"  // unaligned store
+    "movdqu    %%xmm1,0x10(%1)                 \n"  // unaligned store
+    "lea       0x20(%1),%1                     \n"  // dst_argb += 32
+    "sub       $0x8,%2                         \n"  // 8 pixels per loop
+    "jg        1b                              \n"  // until pix <= 0
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
   asm volatile (
     "movdqa    %3,%%xmm5                       \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index f0001cf80..680e24935 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -131,6 +131,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = {
   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
 };
 
+// Duplicates gray value 3 times and fills in alpha opaque.
 __declspec(naked) __declspec(align(16))
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   __asm {
@@ -159,6 +160,35 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   }
 }
 
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+                                  int pix) {  // 8 Y bytes -> 32 bytes per loop
+  __asm {
+    mov        eax, [esp + 4]        // src_y
+    mov        edx, [esp + 8]        // dst_argb
+    mov        ecx, [esp + 12]       // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    pslld      xmm5, 24
+
+    align      16
+  convertloop:
+    movq       xmm0, qword ptr [eax]  // load 8 Y bytes
+    lea        eax,  [eax + 8]        // src_y += 8
+    punpcklbw  xmm0, xmm0             // Y -> YY, 8 words
+    movdqa     xmm1, xmm0             // keep copy for high half
+    punpcklwd  xmm0, xmm0             // YYYY for pixels 0-3
+    punpckhwd  xmm1, xmm1             // YYYY for pixels 4-7
+    por        xmm0, xmm5             // top byte of each dword = 0xff
+    por        xmm1, xmm5
+    movdqu     [edx], xmm0            // unaligned stores
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]        // dst_argb += 32
+    sub        ecx, 8                 // 8 pixels per loop
+    jg         convertloop            // until pix <= 0
+    ret
+  }
+}
+
 __declspec(naked) __declspec(align(16))
 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
 __asm {