From 324fa32739bea6770b72ba9d647c25a4539b01ae Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Tue, 28 Nov 2017 10:30:55 -0800
Subject: [PATCH] Convert16To8Row_SSSE3 port from AVX2

H010ToAR30 uses Convert16To8Row_SSSE3 to convert 10 bit YUV to 8 bit.
Then standard YUV conversion can be used. This improves performance on
low end CPUs. A future CL will bypass this conversion, allowing a
10 bit YUV source, but the function will remain useful as a utility
for YUV conversions.

Bug: libyuv:559, libyuv:751
Test: out/Release/libyuv_unittest --gtest_filter=*H010ToAR30* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1
Change-Id: I9b3ef22d88a5fd861de4cf1900b4c6e8fd24d0af
Reviewed-on: https://chromium-review.googlesource.com/792334
Commit-Queue: Frank Barchard <fbarchard@google.com>
Reviewed-by: Frank Barchard <fbarchard@google.com>
---
 include/libyuv/row.h      |  17 +++++-
 source/convert_argb.cc    |  14 +++--
 source/row_any.cc         |  28 +++++++++-
 source/row_gcc.cc         |  31 +++++++++++
 unit_test/compare_test.cc |   2 +-
 unit_test/convert_test.cc | 114 +++++++++++++++++++++-----------------
 unit_test/planar_test.cc  |   4 ++
 7 files changed, 149 insertions(+), 61 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 743f6b154..16e0fd834 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -37,7 +37,7 @@ extern "C" {
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
+// define LIBYUV_DISABLE_X86
 #endif
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -268,6 +268,7 @@ extern "C" {
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #endif
@@ -1541,11 +1542,23 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
                          int width);
 void MultiplyRow_16_C(const uint16* src_y,
                       uint16* dst_y,
                       int scale,
                       int width);
+void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width);
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
                           int scale,
                           int width);
-void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_Any_SSSE3(const uint16* src_y,
+                               uint8* dst_y,
+                               int scale,
+                               int width);
+void Convert16To8Row_Any_AVX2(const uint16* src_y,
+                              uint8* dst_y,
+                              int scale,
+                              int width);
 
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
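
For reference, the scale argument picks an effective right shift: each row
function computes (v * scale) >> 16 per pixel (that is what pmulhuw does in
the assembly further down), so scale = 16384 maps 10 bit values (0..1023) to
8 bit (0..255). A minimal scalar sketch of that contract, mirroring what
Convert16To8Row_C is expected to do; illustrative only, not part of the
patch, and the saturating clamp is an assumption for out-of-range input:

  void Convert16To8Row_Sketch(const uint16* src_y, uint8* dst_y,
                              int scale, int width) {
    for (int x = 0; x < width; ++x) {
      int value = (src_y[x] * scale) >> 16;  // scale 16384: same as v >> 2
      dst_y[x] = value > 255 ? 255 : value;  // saturate, like packuswb
    }
  }
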
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 8875da57a..0fdaf6c5e 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -462,15 +469,22 @@ static int H010ToAR30Matrix(const uint16* src_y,
     dst_stride_ar30 = -dst_stride_ar30;
   }
 
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_CONVERT16TO8ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    Convert16To8Row = Convert16To8Row_C;  // TODO(fbarchard): Any AVX2
-    if (IS_ALIGNED(width, 64)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
       Convert16To8Row = Convert16To8Row_AVX2;
     }
   }
 #endif
-
 #if defined(HAS_ARGBTOAR30ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
@@ -479,7 +486,6 @@ static int H010ToAR30Matrix(const uint16* src_y,
     }
   }
 #endif
-
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 4f1877656..940f13983 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -732,10 +732,34 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
 #undef ANY11P
 
 // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                            \
+  void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(uint16 temp[32]);                                            \
+    SIMD_ALIGNED(uint8 out[32]);                                              \
+    memset(temp, 0, 64); /* for msan */                                       \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n);                                   \
+    }                                                                         \
+    memcpy(temp, src_ptr + n, r * SBPP);                                      \
+    ANY_SIMD(temp, out, scale, MASK + 1);                                     \
+    memcpy(dst_ptr + n, out, r * BPP);                                        \
+  }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
 #define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                       \
   void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) {  \
-    SIMD_ALIGNED(uint16 temp[16 * 2]);                                        \
-    memset(temp, 0, 32); /* for msan */                                       \
+    SIMD_ALIGNED(uint16 temp[32 * 2]);                                        \
+    memset(temp, 0, 64); /* for msan */                                       \
     int r = width & MASK;                                                     \
     int n = width & ~MASK;                                                    \
     if (n > 0) {                                                              \
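
Expanding ANY11C by hand for the SSSE3 instantiation (SBPP = 2, BPP = 1,
MASK = 15) makes the remainder strategy explicit; this hypothetical expansion
is for illustration only. The SIMD kernel runs on the multiple-of-16 body,
and the final width % 16 pixels bounce through zeroed, aligned temporaries so
the kernel never reads or writes past the caller's buffers:

  void Convert16To8Row_Any_SSSE3(const uint16* src_ptr, uint8* dst_ptr,
                                 int scale, int width) {
    SIMD_ALIGNED(uint16 temp[32]);
    SIMD_ALIGNED(uint8 out[32]);
    memset(temp, 0, 64); /* for msan */
    int r = width & 15;   // remainder pixels
    int n = width & ~15;  // multiple-of-16 body
    if (n > 0) {
      Convert16To8Row_SSSE3(src_ptr, dst_ptr, scale, n);
    }
    memcpy(temp, src_ptr + n, r * 2);             // 2 bytes per source pixel
    Convert16To8Row_SSSE3(temp, out, scale, 16);  // one full vector pass
    memcpy(dst_ptr + n, out, r * 1);              // copy back only r pixels
  }
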
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 45d287faa..a66da6fab 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2894,6 +2894,37 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+    "movd       %3,%%xmm3                      \n"
+    "punpcklwd  %%xmm3,%%xmm3                  \n"
+    "pshufd     $0x0,%%xmm3,%%xmm3             \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqu     0x10(%0),%%xmm1                \n"
+    "pmulhuw    %%xmm3,%%xmm0                  \n"
+    "pmulhuw    %%xmm3,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
+    "movdqu     %%xmm0,(%1)                    \n"
+    "add        $0x20,%0                       \n"
+    "add        $0x10,%1                       \n"
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+
 #ifdef HAS_MULTIPLYROW_16_AVX2
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
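
For readers who prefer intrinsics to AT&T assembly, the loop above
corresponds to the following sketch. This is a hand translation, not part of
the patch, and it assumes width is a multiple of 16, as the non-Any dispatch
in convert_argb.cc guarantees:

  #include <emmintrin.h>  // SSE2 covers pmulhuw/packuswb on xmm registers

  void Convert16To8Row_Intrinsics(const uint16* src_y, uint8* dst_y,
                                  int scale, int width) {
    // movd + punpcklwd + pshufd: broadcast scale into all eight word lanes.
    __m128i vscale = _mm_set1_epi16((short)scale);
    for (int x = 0; x < width; x += 16) {
      __m128i v0 = _mm_loadu_si128((const __m128i*)(src_y + x));
      __m128i v1 = _mm_loadu_si128((const __m128i*)(src_y + x + 8));
      v0 = _mm_mulhi_epu16(v0, vscale);  // pmulhuw: (v * scale) >> 16
      v1 = _mm_mulhi_epu16(v1, vscale);
      // packuswb: pack 16 words to 16 bytes with unsigned saturation.
      _mm_storeu_si128((__m128i*)(dst_y + x), _mm_packus_epi16(v0, v1));
    }
  }
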
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index ff39b2b0f..1c6d988ef 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -338,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64;  // 536870848
 
 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
   uint32 h1 = 0;
-  const int kMaxWidth = benchmark_width_ * benchmark_height_;
+  const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
   align_buffer_page_end(src_a, kMaxWidth);
   align_buffer_page_end(src_b, kMaxWidth);
   memset(src_a, 255u, kMaxWidth);
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 8bcb63d3c..b0bbb590a 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -1966,63 +1966,73 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
 // Alias to copy pixels as is
 #define AR30ToAR30 ARGBToARGB
 
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,      \
-                         ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C,      \
-                         BPP_C)                                               \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                       \
-    const int kWidth = ((W1280) > 0) ? (W1280) : 1;                           \
-    const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                  \
-    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                     \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
-    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
-    const int kBpc = 2;                                                       \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF);                \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF);                        \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF);                        \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF);               \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF);             \
-    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
-      reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff);       \
-    }                                                                         \
-    for (int i = 0; i < kSizeUV; ++i) {                                       \
-      reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff);       \
-      reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff);       \
-    }                                                                         \
-    memset(dst_argb_c + OFF, 1, kStrideB * kHeight);                          \
-    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight);                      \
-    MaskCpuFlags(disable_cpu_flags_);                                         \
-    FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth,     \
-                          reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV,  \
-                          reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV,  \
-                          dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight);   \
-    MaskCpuFlags(benchmark_cpu_info_);                                        \
-    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
-      FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y) + OFF, kWidth,   \
-                            reinterpret_cast<uint16*>(src_u) + OFF, kStrideUV, \
-                            reinterpret_cast<uint16*>(src_v) + OFF, kStrideUV, \
-                            dst_argb_opt + OFF, kStrideB, kWidth,             \
-                            NEG kHeight);                                     \
-    }                                                                         \
-    int max_diff = 0;                                                         \
-    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) {                      \
-      int abs_diff = abs(static_cast<int>(dst_argb_c[i]) -                    \
-                         static_cast<int>(dst_argb_opt[i]));                  \
-      if (abs_diff > max_diff) {                                              \
-        max_diff = abs_diff;                                                  \
-      }                                                                       \
-    }                                                                         \
-    EXPECT_LE(max_diff, DIFF);                                                \
-    free_aligned_buffer_page_end(src_y);                                      \
-    free_aligned_buffer_page_end(src_u);                                      \
-    free_aligned_buffer_page_end(src_v);                                      \
-    free_aligned_buffer_page_end(dst_argb_c);                                 \
-    free_aligned_buffer_page_end(dst_argb_opt);                               \
-  }
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,      \
+                         ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF,      \
+                         FMT_C, BPP_C)                                        \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) {                       \
+    const int kWidth = ((W1280) > 0) ? (W1280) : 1;                           \
+    const int kHeight = ALIGNINT(benchmark_height_, YALIGN);                  \
+    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);                     \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X);                       \
+    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y);            \
+    const int kBpc = 2;                                                       \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF);               \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF);                       \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF);              \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF);            \
+    for (int i = 0; i < kWidth * kHeight; ++i) {                              \
+      reinterpret_cast<uint16*>(src_y + SOFF)[i] = (fastrand() & 0x3ff);      \
+    }                                                                         \
+    for (int i = 0; i < kSizeUV; ++i) {                                       \
+      reinterpret_cast<uint16*>(src_u + SOFF)[i] = (fastrand() & 0x3ff);      \
+      reinterpret_cast<uint16*>(src_v + SOFF)[i] = (fastrand() & 0x3ff);      \
+    }                                                                         \
+    memset(dst_argb_c + DOFF, 1, kStrideB * kHeight);                         \
+    memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight);                     \
+    MaskCpuFlags(disable_cpu_flags_);                                         \
+    FMT_PLANAR##To##FMT_B(reinterpret_cast<uint16*>(src_y + SOFF), kWidth,    \
+                          reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV, \
+                          reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV, \
+                          dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight);  \
+    MaskCpuFlags(benchmark_cpu_info_);                                        \
+    for (int i = 0; i < benchmark_iterations_; ++i) {                         \
+      FMT_PLANAR##To##FMT_B(                                                  \
+          reinterpret_cast<uint16*>(src_y + SOFF), kWidth,                    \
+          reinterpret_cast<uint16*>(src_u + SOFF), kStrideUV,                 \
+          reinterpret_cast<uint16*>(src_v + SOFF), kStrideUV,                 \
+          dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight);                \
+    }                                                                         \
+    int max_diff = 0;                                                         \
+    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) {                      \
+      int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) -             \
+                         static_cast<int>(dst_argb_opt[i + DOFF]));           \
+      if (abs_diff > max_diff) {                                              \
+        max_diff = abs_diff;                                                  \
+      }                                                                       \
+    }                                                                         \
+    EXPECT_LE(max_diff, DIFF);                                                \
+    free_aligned_buffer_page_end(src_y);                                      \
+    free_aligned_buffer_page_end(src_u);                                      \
+    free_aligned_buffer_page_end(src_v);                                      \
+    free_aligned_buffer_page_end(dst_argb_c);                                 \
+    free_aligned_buffer_page_end(dst_argb_opt);                               \
+  }
 
 #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                         YALIGN, DIFF, FMT_C, BPP_C)                            \
   TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
-                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+                   YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0, FMT_C,   \
+                   BPP_C)                                                      \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
+                   YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1, FMT_C, \
+                   BPP_C)                                                      \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
+                   YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0, FMT_C,    \
+                   BPP_C)                                                      \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN,      \
+                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0, FMT_C,       \
+                   BPP_C)
 
 TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4)
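
With the extra TESTPLANAR16TOBI rows, the single TESTPLANAR16TOB(H010, ...)
instantiation above now generates four tests instead of one:
H010ToAR30_Any (width of benchmark_width_ - 4, exercising the Any remainder
path), H010ToAR30_Unaligned (source and destination buffers offset by one
byte), H010ToAR30_Invert (negative height, bottom-up output), and
H010ToAR30_Opt (aligned, full width). They can be run together with a filter
along the lines of the Test: line above, for example:

  out/Release/libyuv_unittest --gtest_filter=LibYUVConvertTest.H010ToAR30_*
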
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 151bcafd1..6e1c27cad 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2720,10 +2720,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
                     dst_pixels_y_c, 16384, kPixels);
 
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
       Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
                            dst_pixels_y_opt, 16384, kPixels);
+    } else if (has_ssse3) {
+      Convert16To8Row_SSSE3(reinterpret_cast<const uint16*>(src_pixels_y),
+                            dst_pixels_y_opt, 16384, kPixels);
     } else {
       Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y),
                         dst_pixels_y_opt, 16384, kPixels);
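
All of this is reached through the public H010ToAR30 entry point, whose
argument order the convert_test.cc macro above mirrors (Y, U, V pointers each
followed by a stride, then the destination and its stride, then width and
height). A minimal usage sketch, not part of the patch; it assumes the
uint16/uint8 typedefs from libyuv/basic_types.h and 16 bit plane strides
counted in elements, as in the tests:

  #include <vector>

  #include "libyuv/convert_argb.h"

  int ConvertOneFrame() {
    const int width = 1280, height = 720;  // matches the Test: line above
    const int halfwidth = (width + 1) / 2;
    const int halfheight = (height + 1) / 2;
    std::vector<uint16> y(width * height);  // 10 bit samples, one per uint16
    std::vector<uint16> u(halfwidth * halfheight);  // 4:2:0 chroma planes
    std::vector<uint16> v(halfwidth * halfheight);
    std::vector<uint8> ar30(width * 4 * height);  // AR30: 4 bytes per pixel
    return libyuv::H010ToAR30(y.data(), width, u.data(), halfwidth, v.data(),
                              halfwidth, ar30.data(), width * 4, width,
                              height);  // returns 0 on success
  }
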