From 9d48df9ac434cf036bd8ec648fc040756a382c60 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Sun, 24 Mar 2013 20:12:25 +0000
Subject: [PATCH] Sobel port to posix. Improved unittest for C version.

BUG=201
TESTED=try bots
Review URL: https://webrtc-codereview.appspot.com/1242004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@619 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |   2 +-
 include/libyuv/row.h       |   8 +-
 include/libyuv/version.h   |   2 +-
 source/planar_functions.cc |   1 -
 source/row_posix.cc        | 211 +++++++++++++++++++++++++++++++++++++
 unit_test/planar_test.cc   |  85 +++++++--------
 6 files changed, 256 insertions(+), 53 deletions(-)

diff --git a/README.chromium b/README.chromium
index a678b67f0..31fc32edf 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 618
+Version: 619
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index d5a0452b7..0d3848974 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -118,6 +118,10 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELXROW_SSSE3
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSSE3
 #endif
 
 // The following are Windows only.
@@ -125,10 +129,6 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_COPYROW_AVX2
-#define HAS_SOBELXROW_SSSE3
-#define HAS_SOBELYROW_SSSE3
-#define HAS_SOBELROW_SSE2
-#define HAS_SOBELXYROW_SSE2
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 42062b779..93df7444d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 618
+#define LIBYUV_VERSION 619
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 68567077b..d88dc60c3 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1970,7 +1970,6 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
     SobelXRow = SobelXRow_SSSE3;
   }
 #endif
-
   void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 471f94585..e26303e80 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -4067,6 +4067,217 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
+#ifdef HAS_SOBELXROW_SSSE3
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+                     const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "sub %0,%2 \n"
+    "sub %0,%3 \n"
+    "pxor %%xmm5,%%xmm5 \n"
+
+    // 8 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movq (%0),%%xmm0 \n"
+    "movq 0x2(%0),%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "psubw %%xmm1,%%xmm0 \n"
+    "movq (%0,%1,1),%%xmm1 \n"
+    "movq 0x2(%0,%1,1),%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "psubw %%xmm2,%%xmm1 \n"
+    "movq (%0,%2,1),%%xmm2 \n"
+    "movq 0x2(%0,%2,1),%%xmm3 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm3 \n"
+    "psubw %%xmm3,%%xmm2 \n"
+    "paddw %%xmm2,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "pabsw %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "sub $0x8,%4 \n"
+    "movq %%xmm0,(%0,%3,1) \n"
+    "lea 0x8(%0),%0 \n"
+    "jg 1b \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELXROW_SSSE3
+
+#ifdef HAS_SOBELYROW_SSSE3
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+                     uint8* dst_sobely, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "sub %0,%2 \n"
+    "pxor %%xmm5,%%xmm5 \n"
+
+    // 8 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movq (%0),%%xmm0 \n"
+    "movq (%0,%1,1),%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "psubw %%xmm1,%%xmm0 \n"
+    "movq 0x1(%0),%%xmm1 \n"
+    "movq 0x1(%0,%1,1),%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "psubw %%xmm2,%%xmm1 \n"
+    "movq 0x2(%0),%%xmm2 \n"
+    "movq 0x2(%0,%1,1),%%xmm3 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm3 \n"
+    "psubw %%xmm3,%%xmm2 \n"
+    "paddw %%xmm2,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "pabsw %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "sub $0x8,%3 \n"
+    "movq %%xmm0,(%0,%2,1) \n"
+    "lea 0x8(%0),%0 \n"
+    "jg 1b \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELYROW_SSSE3
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pslld $0x18,%%xmm5 \n"
+
+    // 16 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "movdqa (%0,%1,1),%%xmm1 \n"
+    "lea 0x10(%0),%0 \n"
+    "paddusb %%xmm1,%%xmm0 \n"
+    "movdqa %%xmm0,%%xmm2 \n"
+    "punpcklbw %%xmm0,%%xmm2 \n"
+    "punpckhbw %%xmm0,%%xmm0 \n"
+    "movdqa %%xmm2,%%xmm1 \n"
+    "punpcklwd %%xmm2,%%xmm1 \n"
+    "punpckhwd %%xmm2,%%xmm2 \n"
+    "por %%xmm5,%%xmm1 \n"
+    "por %%xmm5,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm3 \n"
+    "punpcklwd %%xmm0,%%xmm3 \n"
+    "punpckhwd %%xmm0,%%xmm0 \n"
+    "por %%xmm5,%%xmm3 \n"
+    "por %%xmm5,%%xmm0 \n"
+    "sub $0x10,%3 \n"
+    "movdqa %%xmm1,(%2) \n"
+    "movdqa %%xmm2,0x10(%2) \n"
+    "movdqa %%xmm3,0x20(%2) \n"
+    "movdqa %%xmm0,0x30(%2) \n"
+    "lea 0x40(%2),%2 \n"
+    "jg 1b \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+
+    // 16 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "movdqa (%0,%1,1),%%xmm1 \n"
+    "lea 0x10(%0),%0 \n"
+    "movdqa %%xmm0,%%xmm2 \n"
+    "paddusb %%xmm1,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm3 \n"
+    "punpcklbw %%xmm5,%%xmm3 \n"
+    "punpckhbw %%xmm5,%%xmm0 \n"
+    "movdqa %%xmm1,%%xmm4 \n"
+    "punpcklbw %%xmm2,%%xmm4 \n"
+    "punpckhbw %%xmm2,%%xmm1 \n"
+    "movdqa %%xmm4,%%xmm6 \n"
+    "punpcklwd %%xmm3,%%xmm6 \n"
+    "punpckhwd %%xmm3,%%xmm4 \n"
+    "movdqa %%xmm1,%%xmm7 \n"
+    "punpcklwd %%xmm0,%%xmm7 \n"
+    "punpckhwd %%xmm0,%%xmm1 \n"
+    "sub $0x10,%3 \n"
+    "movdqa %%xmm6,(%2) \n"
+    "movdqa %%xmm4,0x10(%2) \n"
+    "movdqa %%xmm7,0x20(%2) \n"
+    "movdqa %%xmm1,0x30(%2) \n"
+    "lea 0x40(%2),%2 \n"
+    "jg 1b \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 029e2d03b..94284b66b 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -896,6 +896,7 @@ TEST_F(libyuvTest, TestSobelX) {
   SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
   SIMD_ALIGNED(uint8 orig_pixels_2[256 + 2]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
 
   for (int i = 0; i < 256 + 2; ++i) {
     orig_pixels_0[i] = i;
@@ -909,30 +910,29 @@
   EXPECT_EQ(16u, sobel_pixels_c[0]);
   EXPECT_EQ(16u, sobel_pixels_c[100]);
   EXPECT_EQ(255u, sobel_pixels_c[255]);
+
+  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) =
+      SobelXRow_C;
 #if defined(HAS_SOBELXROW_SSSE3)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
-  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-  if (has_ssse3) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXRow_SSSE3(orig_pixels_0, orig_pixels_1, orig_pixels_2,
-                      sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2,
-                  sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SobelXRow = SobelXRow_SSSE3;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2,
+              sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 256; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
   }
-#endif
 }
 
 TEST_F(libyuvTest, TestSobelY) {
   SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]);
   SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
 
   for (int i = 0; i < 256 + 2; ++i) {
     orig_pixels_0[i] = i;
@@ -944,28 +944,26 @@
   EXPECT_EQ(4u, sobel_pixels_c[0]);
   EXPECT_EQ(255u, sobel_pixels_c[100]);
   EXPECT_EQ(0u, sobel_pixels_c[255]);
+  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) = SobelYRow_C;
 #if defined(HAS_SOBELYROW_SSSE3)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
-  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-  if (has_ssse3) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelYRow_SSSE3(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SobelYRow = SobelYRow_SSSE3;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 256; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
   }
-#endif
 }
 
 TEST_F(libyuvTest, TestSobel) {
   SIMD_ALIGNED(uint8 orig_sobelx[256]);
   SIMD_ALIGNED(uint8 orig_sobely[256]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
 
   for (int i = 0; i < 256; ++i) {
     orig_sobelx[i] = i;
@@ -985,28 +983,26 @@
   EXPECT_EQ(255u, sobel_pixels_c[7]);
   EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
   EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) = SobelRow_C;
 #if defined(HAS_SOBELROW_SSE2)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
-  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
-  if (has_sse2) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelRow_SSE2(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelRow = SobelRow_SSE2;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 16; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
   }
-#endif
 }
 
 TEST_F(libyuvTest, TestSobelXY) {
   SIMD_ALIGNED(uint8 orig_sobelx[256]);
   SIMD_ALIGNED(uint8 orig_sobely[256]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
 
   for (int i = 0; i < 256; ++i) {
     orig_sobelx[i] = i;
@@ -1022,22 +1018,19 @@
   EXPECT_EQ(255u, sobel_pixels_c[7]);
   EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
   EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
-  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
-  if (has_sse2) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXYRow_SSE2(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXYRow = SobelXYRow_SSE2;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 16; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
  }
-#endif
 }
 
 TEST_F(libyuvTest, TestCopyPlane) {
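
For reference, a minimal scalar sketch of the gradient math the SobelXRow_SSSE3 and SobelYRow_SSSE3 kernels above compute (the *_Ref names are illustrative only, not libyuv API). Each output byte is |a + 2*b + c| clamped to 255, built from three left-minus-right (or top-minus-bottom) differences; the kernels compute the negation of the matrices shown in the comments, which is equivalent under the absolute value (pabsw), and packuswb supplies the clamp.

#include <stdint.h>
#include <stdlib.h>

// Clamp a non-negative sum to a byte, mirroring packuswb saturation.
static uint8_t Clamp255(int v) {
  return (uint8_t)(v > 255 ? 255 : v);
}

// Scalar sketch of SobelXRow: src_y0/src_y1/src_y2 are the rows above, at,
// and below the output row; each must be readable for width + 2 bytes.
void SobelXRow_Ref(const uint8_t* src_y0, const uint8_t* src_y1,
                   const uint8_t* src_y2, uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];  // top row, weight 1
    int b = src_y1[i] - src_y1[i + 2];  // middle row, weight 2
    int c = src_y2[i] - src_y2[i + 2];  // bottom row, weight 1
    dst_sobelx[i] = Clamp255(abs(a + 2 * b + c));
  }
}

// Scalar sketch of SobelYRow: only two rows are needed because the middle
// row of the SobelY matrix is all zeros.
void SobelYRow_Ref(const uint8_t* src_y0, const uint8_t* src_y1,
                   uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y1[i];          // left column, weight 1
    int b = src_y0[i + 1] - src_y1[i + 1];  // center column, weight 2
    int c = src_y0[i + 2] - src_y1[i + 2];  // right column, weight 1
    dst_sobely[i] = Clamp255(abs(a + 2 * b + c));
  }
}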
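Likewise, a hedged scalar sketch of the two SSE2 packing kernels. SobelRow writes the saturated X+Y sum to B, G and R with A = 255; SobelXYRow packs Sobel Y into B, the saturated sum into G and Sobel X into R, as the header comments state. Bytes are shown in B, G, R, A memory order, assuming libyuv's little-endian ARGB layout; the SIMD paddusb (unsigned saturating add) is modeled with an explicit clamp.

#include <stdint.h>

// Scalar sketch of SobelRow_SSE2: one gray Sobel value replicated into
// B, G and R, with alpha forced to 255 (the pslld $0x18 mask above).
void SobelRow_Ref(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                  uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);  // paddusb saturates
    dst_argb[0] = v;    // B = Sobel
    dst_argb[1] = v;    // G = Sobel
    dst_argb[2] = v;    // R = Sobel
    dst_argb[3] = 255;  // A = opaque
    dst_argb += 4;
  }
}

// Scalar sketch of SobelXYRow_SSE2: the punpcklbw/punpcklwd cascade above
// interleaves the same four bytes per pixel that are written here.
void SobelXYRow_Ref(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                    uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];                 // B = Sobel Y
    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  // G = combined Sobel
    dst_argb[2] = src_sobelx[i];                 // R = Sobel X
    dst_argb[3] = 255;                           // A = opaque
    dst_argb += 4;
  }
}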