From c56a55fc7206a257eecc21969f94ab066dd80f2f Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Thu, 4 Apr 2013 18:33:44 +0000
Subject: [PATCH] Sobel and SobelXY Neon port.  Improved Bayer - did 8 at time
 version, and specialized G channel version. BUG=201 TEST=libyuvTest.TestSobel
 and libyuvTest.TestSobelXY Review URL:
 https://webrtc-codereview.appspot.com/1279006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@642 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium             |   2 +-
 include/libyuv/row.h        |  35 ++++++---
 include/libyuv/version.h    |   2 +-
 source/format_conversion.cc |   8 +-
 source/planar_functions.cc  |  38 +++++++++-
 source/row_any.cc           |   2 +-
 source/row_neon.cc          | 142 ++++++++++++++++++++++++++++++++++--
 unit_test/planar_test.cc    |  20 +++++
 8 files changed, 222 insertions(+), 27 deletions(-)

diff --git a/README.chromium b/README.chromium
index 816d70514..e263c2cf6 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 641
+Version: 642
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 84bbbba46..fc31c8eac 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -197,6 +197,7 @@ extern "C" {
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
 #define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
 #define HAS_ARGBTORAWROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
@@ -269,6 +270,10 @@ extern "C" {
 #define HAS_ARGBSEPIAROW_NEON
 #define HAS_ARGBSHADEROW_NEON
 #define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
 #endif
 
 // The following are available on Mips platforms
@@ -1315,16 +1320,18 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix);
 
-void ARGBToBayerRow_C(const uint8* src_argb,
-                      uint8* dst_bayer, uint32 selector, int pix);
-void ARGBToBayerRow_SSSE3(const uint8* src_argb,
-                          uint8* dst_bayer, uint32 selector, int pix);
-void ARGBToBayerRow_NEON(const uint8* src_argb,
-                         uint8* dst_bayer, uint32 selector, int pix);
-void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb,
-                              uint8* dst_bayer, uint32 selector, int pix);
-void ARGBToBayerRow_Any_NEON(const uint8* src_argb,
-                             uint8* dst_bayer, uint32 selector, int pix);
+void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
+                      uint32 selector, int pix);
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+                          uint32 selector, int pix);
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                         uint32 selector, int pix);
+void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+                              uint32 selector, int pix);
+void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
+                             uint32 selector, int pix);
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                           uint32 /* selector */, int pix);
 
 void I422ToYUY2Row_C(const uint8* src_y,
                      const uint8* src_u,
@@ -1459,18 +1466,26 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
                  uint8* dst_sobelx, int width);
 void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                      const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
 void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
                  uint8* dst_sobely, int width);
 void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
                      uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
 void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
                 uint8* dst_argb, int width);
 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                    uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
 void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width);
 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 93d8adda0..933031adc 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 641
+#define LIBYUV_VERSION 642
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 53955f715..d2e773f3d 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -80,9 +80,9 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
     }
   }
 #elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToBayerRow = ARGBToBayerRow_NEON;
     }
   }
@@ -437,9 +437,9 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
     }
   }
 #elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToBayerRow = ARGBToBayerRow_NEON;
     }
   }
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 77af629a1..72dff8b65 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1769,9 +1769,9 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
     }
   }
 #elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToBayerRow = ARGBToBayerRow_NEON;
     }
   }
@@ -1782,6 +1782,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3)) {
     SobelYRow = SobelYRow_SSSE3;
   }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYRow = SobelYRow_NEON;
+  }
 #endif
   void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobely, int width) =
@@ -1790,6 +1795,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3)) {
     SobelXRow = SobelXRow_SSSE3;
   }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXRow = SobelXRow_NEON;
+  }
 #endif
   void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
                    uint8* dst_argb, int width) = SobelRow_C;
@@ -1799,6 +1809,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
     SobelRow = SobelRow_SSE2;
   }
 #endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    SobelRow = SobelRow_NEON;
+  }
+#endif
 
   const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
   SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
@@ -1868,9 +1883,9 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #elif defined(HAS_ARGBTOBAYERROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
-    if (IS_ALIGNED(width, 4)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToBayerRow = ARGBToBayerRow_NEON;
     }
   }
@@ -1881,6 +1896,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3)) {
     SobelYRow = SobelYRow_SSSE3;
   }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYRow = SobelYRow_NEON;
+  }
 #endif
   void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
                     const uint8* src_y2, uint8* dst_sobely, int width) =
@@ -1889,6 +1909,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
   if (TestCpuFlag(kCpuHasSSSE3)) {
     SobelXRow = SobelXRow_SSSE3;
   }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXRow = SobelXRow_NEON;
+  }
 #endif
   void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) = SobelXYRow_C;
@@ -1898,6 +1923,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
     SobelXYRow = SobelXYRow_SSE2;
   }
 #endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    SobelXYRow = SobelXYRow_NEON;
+  }
+#endif
 
   const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
   SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
diff --git a/source/row_any.cc b/source/row_any.cc
index 6c0d4f4a3..7e042d603 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -190,7 +190,7 @@ BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
 #endif
 #if defined(HAS_ARGBTOBAYERROW_NEON)
 BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
-         3, 4, 1)
+         7, 4, 1)
 #endif
 #undef BAYERANY
 
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 4796ae3b4..6f075a84d 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1176,18 +1176,20 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
   asm volatile (
-    "vmov.u32   d2[0], %3                      \n"  // selector
+    "vmov.u32   d6[0], %3                      \n"  // selector to d6 lane 0
   "1:                                          \n"
-    "vld1.u8    {q0}, [%0]!                    \n"  // load row 4 pixels.
-    "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d3, {d0, d1}, d2               \n"  // look up 4 pixels
-    "vst1.u32   {d3[0]}, [%1]!                 \n"  // store 4.
+    "vld1.u8    {q0, q1}, [%0]!                \n"  // load row 8 pixels (32B)
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up first 4 pixels
+    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up next 4 pixels
+    "vtrn.u32   d4, d5                         \n"  // d4 = low words of both
+    "vst1.u8    {d4}, [%1]!                    \n"  // store 8 bayer bytes.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_bayer),  // %1
     "+r"(pix)         // %2
   : "r"(selector)     // %3
-  : "cc", "memory", "q0", "q1"  // Clobber List
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List (d0-d6 used)
   );
 }
 
@@ -2595,6 +2597,134 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
   );
 }
 
+// Adds Sobel X and Sobel Y with unsigned saturation and stores the sum as
+// gray ARGB: A = 255, R = G = B = Sobel.
+// Each loop pass handles 8 pixels and repeats while the remaining width is
+// positive, so a width that is not a multiple of 8 is rounded up to the next
+// multiple of 8.
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha, set once
+    // 8 pixel loop.
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d0, d0, d1                     \n"  // saturating add
+    "vmov.u8    d1, d0                         \n"  // copy sum to 2nd byte
+    "vmov.u8    d2, d0                         \n"  // copy sum to 3rd byte
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List (d0-d3 used)
+  );
+}
+
+// Mixes Sobel X, Sobel Y and their saturating sum (Sobel) into ARGB:
+// A = 255, R = Sobel X, G = Sobel, B = Sobel Y.
+// Each loop pass handles 8 pixels and repeats while the remaining width is
+// positive, so a width that is not a multiple of 8 is rounded up to the next
+// multiple of 8.
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha, set once
+    // 8 pixel loop.
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx (3rd byte)
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely (1st byte)
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d1, d0, d2                     \n"  // saturating add (2nd)
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List (d0-d3 used)
+  );
+}
+
+// Stores |SobelX| saturated to 8 bits, 8 per pass. SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.u8    {d0}, [%0],%5                  \n"  // top, then advance 2
+    "vld1.u8    {d1}, [%0],%6                  \n"  // top + 2; net advance 8
+    "vsubl.u8   q0, d0, d1                     \n"  // widening difference
+    "vld1.u8    {d2}, [%1],%5                  \n"  // center * 2
+    "vld1.u8    {d3}, [%1],%6                  \n"  // center + 2
+    "vsubl.u8   q1, d2, d3                     \n"  // widening difference
+    "vadd.s16   q0, q0, q1                     \n"  // add center once
+    "vadd.s16   q0, q0, q1                     \n"  // add center twice
+    "vld1.u8    {d2}, [%2],%5                  \n"  // bottom
+    "vld1.u8    {d3}, [%2],%6                  \n"  // bottom + 2
+    "subs       %4, %4, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"  // widening difference
+    "vadd.s16   q0, q0, q1                     \n"  // add bottom
+    "vabs.s16   q0, q0                         \n"  // absolute value
+    "vqmovn.u16 d0, q0                         \n"  // saturate to 8 bits
+    "vst1.u8    {d0}, [%3]!                    \n"  // store 8 sobelx
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2),            // %5  post-increment after first load of a row
+    "r"(6)             // %6  post-increment after second load (2 + 6 = 8)
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Stores |SobelY| saturated to 8 bits, 8 per pass. SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.u8    {d0}, [%0],%4                  \n"  // left, then advance 1
+    "vld1.u8    {d1}, [%1],%4                  \n"  // left of second row
+    "vsubl.u8   q0, d0, d1                     \n"  // src_y0 - src_y1, widen
+    "vld1.u8    {d2}, [%0],%4                  \n"  // center * 2
+    "vld1.u8    {d3}, [%1],%4                  \n"  // center of second row
+    "vsubl.u8   q1, d2, d3                     \n"  // src_y0 - src_y1, widen
+    "vadd.s16   q0, q0, q1                     \n"  // add center once
+    "vadd.s16   q0, q0, q1                     \n"  // add center twice
+    "vld1.u8    {d2}, [%0],%5                  \n"  // right; net advance 8
+    "vld1.u8    {d3}, [%1],%5                  \n"  // right of second row
+    "subs       %3, %3, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"  // src_y0 - src_y1, widen
+    "vadd.s16   q0, q0, q1                     \n"  // add right
+    "vabs.s16   q0, q0                         \n"  // absolute value
+    "vqmovn.u16 d0, q0                         \n"  // saturate to 8 bits
+    "vst1.u8    {d0}, [%2]!                    \n"  // store 8 sobely
+    "bgt        1b                             \n"  // loop while width > 0
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1),            // %4  post-increment after left and center loads
+    "r"(6)             // %5  post-increment after right load (1 + 1 + 6 = 8)
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index efcd33dc0..469647667 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -977,6 +977,11 @@ TEST_F(libyuvTest, TestSobelX) {
   if (TestCpuFlag(kCpuHasSSSE3)) {
     SobelXRow = SobelXRow_SSSE3;
   }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXRow = SobelXRow_NEON;
+  }
 #endif
   for (int i = 0; i < benchmark_pixels_div256_; ++i) {
     SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2,
@@ -1009,6 +1014,11 @@ TEST_F(libyuvTest, TestSobelY) {
   if (TestCpuFlag(kCpuHasSSSE3)) {
     SobelYRow = SobelYRow_SSSE3;
   }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYRow = SobelYRow_NEON;
+  }
 #endif
   for (int i = 0; i < benchmark_pixels_div256_; ++i) {
     SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
@@ -1048,6 +1058,11 @@ TEST_F(libyuvTest, TestSobel) {
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelRow = SobelRow_SSE2;
   }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelRow = SobelRow_NEON;
+  }
 #endif
   for (int i = 0; i < benchmark_pixels_div256_; ++i) {
     SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
@@ -1083,6 +1098,11 @@ TEST_F(libyuvTest, TestSobelXY) {
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXYRow = SobelXYRow_SSE2;
   }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXYRow = SobelXYRow_NEON;
+  }
 #endif
   for (int i = 0; i < benchmark_pixels_div256_; ++i) {
     SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);