From 64961c01b200a77b4af9629bf1215358ec056f0a Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Wed, 19 Sep 2012 20:03:20 +0000
Subject: [PATCH] ARGBToRGBA_NEON and ARGBToRGB24_NEON BUG=68 TEST=none Review
 URL: https://webrtc-codereview.appspot.com/816004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@367 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |  2 +-
 include/libyuv/row.h       |  7 +++++++
 include/libyuv/version.h   |  2 +-
 source/convert.cc          | 18 ++++++++++++++++++
 source/convert_from.cc     | 11 +++++++++++
 source/planar_functions.cc | 16 ++++++++++++++++
 source/row_common.cc       |  3 +++
 source/row_neon.cc         | 37 +++++++++++++++++++++++++++++++++++++
 8 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/README.chromium b/README.chromium
index b7d3b99cf..2cdd54f36 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 366
+Version: 367
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ced0e3201..d2ac3d030 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -115,6 +115,8 @@ extern "C" {
 #define HAS_I422TOBGRAROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TORGBAROW_NEON
+#define HAS_ARGBTORGBAROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER)
@@ -256,6 +258,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
 void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -472,6 +477,8 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
 void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d6e2cb42e..da61880cf 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 366
+#define LIBYUV_VERSION 367
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert.cc b/source/convert.cc
index b23afdade..1e24d0642 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -744,6 +744,24 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
       }
     }
   }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 16) {
+      YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+      YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUVRow = YUY2ToUVRow_Unaligned_NEON;
+      YUY2ToYRow = YUY2ToYRow_Unaligned_NEON;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+        YUY2ToUVRow = YUY2ToUVRow_NEON;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          YUY2ToYRow = YUY2ToYRow_NEON;
+        }
+      }
+    }
+  }
 #endif
   for (int y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 0abbd6855..3ab7bc631 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -927,6 +927,17 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  // TODO(fbarchard): One step I420ToRGB24Row_NEON.
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width * 3 <= kMaxStride) {
+      ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+    }
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     I422ToARGBRow(src_y, src_u, src_v, row, width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 54ff4614b..e8146e571 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -320,6 +320,12 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
     ARGBToRGBARow = ARGBToRGBARow_SSSE3;
   }
 #endif
+#if defined(HAS_ARGBTORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) &&
+      IS_ALIGNED(width, 16)) {
+    ARGBToRGBARow = ARGBToRGBARow_NEON;
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     ARGBToRGBARow(src_argb, dst_rgba, width);
@@ -355,6 +361,16 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width * 3 <= kMaxStride) {
+      ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+    }
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     ARGBToRGB24Row(src_argb, dst_rgb24, width);
diff --git a/source/row_common.cc b/source/row_common.cc
index b84bc142d..337ecfd04 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -979,6 +979,9 @@ RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2)
 RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2)
 RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
 #endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3)
+#endif
 #undef RGBANY
 
 #ifdef HAS_ARGBTOYROW_SSSE3
diff --git a/source/row_neon.cc b/source/row_neon.cc
index a50cc2cf6..c6d4ed28a 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -18,6 +18,8 @@ extern "C" {
 // This module is for GCC Neon
 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 
+// TODO(fbarchard): Make a fetch macro so different subsamples can be done.
+// TODO(fbarchard): Rework register usage to produce RGB in d21 - d23.
 #define YUV422TORGB                                                            \
     "vld1.u8    {d0}, [%0]!                    \n"                             \
     "vld1.u32   {d2[0]}, [%1]!                 \n"                             \
@@ -358,6 +360,41 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
 }
 #endif  // HAS_MIRRORROWUV_NEON
 
+#ifdef HAS_ARGBTORGBAROW_NEON
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
+  asm volatile (  // pix must be a positive multiple of 8.
+  "1:                                          \n"
+    "vld4.u8    {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d0, d4                         \n"  // last channel goes first.
+    "vst4.u8    {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of RGBA.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgba),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGBAROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+  asm volatile (  // pix must be a positive multiple of 8.
+  "1:                                          \n"
+    "vld4.u8    {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst3.u8    {d1, d2, d3}, [%1]!            \n"  // drop d4; store 8 RGB24.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(pix)         // %2
+  :
+  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGB24ROW_NEON
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus