diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 743f6b154..16e0fd834 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -37,7 +37,7 @@ extern "C" {
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
+// define LIBYUV_DISABLE_X86
 #endif
 #endif
 // True if compiling for SSSE3 as a requirement.
@@ -268,6 +268,7 @@ extern "C" {
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #endif
@@ -1541,11 +1542,23 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
                          uint16* dst_y,
                          int scale,
                          int width);
 void MultiplyRow_16_C(const uint16* src_y, uint16* dst_y, int scale, int width);
+void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width);
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
                           int scale,
                           int width);
-void Convert16To8Row_C(const uint16* src_y, uint8* dst_y, int scale, int width);
+void Convert16To8Row_Any_SSSE3(const uint16* src_y,
+                               uint8* dst_y,
+                               int scale,
+                               int width);
+void Convert16To8Row_Any_AVX2(const uint16* src_y,
+                              uint8* dst_y,
+                              int scale,
+                              int width);
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 8875da57a..0fdaf6c5e 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -462,15 +462,22 @@ static int H010ToAR30Matrix(const uint16* src_y,
     dst_stride_ar30 = -dst_stride_ar30;
   }
 
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
 #if defined(HAS_CONVERT16TO8ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    Convert16To8Row = Convert16To8Row_C;  // TODO(fbarchard): Any AVX2
-    if (IS_ALIGNED(width, 64)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
       Convert16To8Row = Convert16To8Row_AVX2;
     }
   }
 #endif
-
 #if defined(HAS_ARGBTOAR30ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
@@ -479,7 +486,6 @@ static int H010ToAR30Matrix(const uint16* src_y,
       }
     }
   }
 #endif
-
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 4f1877656..940f13983 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -732,10 +732,34 @@ ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8*, 4, 4, 7)
 #undef ANY11P
 
 // Any 1 to 1 with parameter and shorts. BPP measures in shorts.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
+  void NAMEANY(const uint16* src_ptr, uint8* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(uint16 temp[32]); \
+    SIMD_ALIGNED(uint8 out[32]); \
+    memset(temp, 0, 64); /* for msan */ \
+    int r = width & MASK; \
+    int n = width & ~MASK; \
+    if (n > 0) { \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n); \
+    } \
+    memcpy(temp, src_ptr + n, r * SBPP); \
+    ANY_SIMD(temp, out, scale, MASK + 1); \
+    memcpy(dst_ptr + n, out, r * BPP); \
+  }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, 15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, 31)
+#endif
+#undef ANY11C
+
+// Any 1 to 1 with parameter and shorts. BPP measures in shorts.
 #define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
   void NAMEANY(const uint16* src_ptr, uint16* dst_ptr, T param, int width) { \
-    SIMD_ALIGNED(uint16 temp[16 * 2]); \
-    memset(temp, 0, 32); /* for msan */ \
+    SIMD_ALIGNED(uint16 temp[32 * 2]); \
+    memset(temp, 0, 64); /* for msan */ \
     int r = width & MASK; \
     int n = width & ~MASK; \
     if (n > 0) { \
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 45d287faa..a66da6fab 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2894,6 +2894,37 @@ void MultiplyRow_16_AVX2(const uint16* src_y,
 // 16384 = 10 bits
 // 4096 = 12 bits
 // 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16* src_y,
+                           uint8* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+    "movd       %3,%%xmm3                      \n"
+    "punpcklwd  %%xmm3,%%xmm3                  \n"
+    "pshufd     $0x0,%%xmm3,%%xmm3             \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "movdqu     (%0),%%xmm0                    \n"
+    "movdqu     0x10(%0),%%xmm1                \n"
+    "pmulhuw    %%xmm3,%%xmm0                  \n"
+    "pmulhuw    %%xmm3,%%xmm1                  \n"
+    "packuswb   %%xmm1,%%xmm0                  \n"
+    "movdqu     %%xmm0,(%1)                    \n"
+    "add        $0x20,%0                       \n"
+    "add        $0x10,%1                       \n"
+    "sub        $0x10,%2                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+
 #ifdef HAS_MULTIPLYROW_16_AVX2
 void Convert16To8Row_AVX2(const uint16* src_y,
                           uint8* dst_y,
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index ff39b2b0f..1c6d988ef 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -338,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64;  // 536870848
 
 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) {
   uint32 h1 = 0;
-  const int kMaxWidth = benchmark_width_ * benchmark_height_;
+  const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 31) & ~31;
   align_buffer_page_end(src_a, kMaxWidth);
   align_buffer_page_end(src_b, kMaxWidth);
   memset(src_a, 255u, kMaxWidth);
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 8bcb63d3c..b0bbb590a 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -1966,63 +1966,73 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
 // Alias to copy pixels as is
 #define AR30ToAR30 ARGBToARGB
 
-#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
-                         ALIGN, YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, \
-                         BPP_C) \
-  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
-    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
-    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
-    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
-    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
-    const int kBpc = 2; \
-    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \
-    align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \
-    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
-    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
-    for (int i = 0; i < kWidth * kHeight; ++i) { \
-      reinterpret_cast<uint16*>(src_y)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    for (int i = 0; i < kSizeUV; ++i) { \
-      reinterpret_cast<uint16*>(src_u)[i + OFF] = (fastrand() & 0x3ff); \
-      reinterpret_cast<uint16*>(src_v)[i + OFF] = (fastrand() & 0x3ff); \
-    } \
-    memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
-    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y) + OFF, kWidth, \
-                          reinterpret_cast<const uint16*>(src_u) + OFF, kStrideUV, \
-                          reinterpret_cast<const uint16*>(src_v) + OFF, kStrideUV, \
-                          dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y) + OFF, kWidth, \
-                            reinterpret_cast<const uint16*>(src_u) + OFF, kStrideUV, \
-                            reinterpret_cast<const uint16*>(src_v) + OFF, kStrideUV, \
-                            dst_argb_opt + OFF, kStrideB, kWidth, \
-                            NEG kHeight); \
-    } \
-    int max_diff = 0; \
-    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
-      int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
-                         static_cast<int>(dst_argb_opt[i])); \
-      if (abs_diff > max_diff) { \
-        max_diff = abs_diff; \
-      } \
-    } \
-    EXPECT_LE(max_diff, DIFF); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_u); \
-    free_aligned_buffer_page_end(src_v); \
-    free_aligned_buffer_page_end(dst_argb_c); \
-    free_aligned_buffer_page_end(dst_argb_opt); \
+#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+                         ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF, \
+                         FMT_C, BPP_C) \
+  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
+    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
+    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+    const int kBpc = 2; \
+    align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
+    align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
+    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
+    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
+    for (int i = 0; i < kWidth * kHeight; ++i) { \
+      reinterpret_cast<uint16*>(src_y + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    for (int i = 0; i < kSizeUV; ++i) { \
+      reinterpret_cast<uint16*>(src_u + SOFF)[i] = (fastrand() & 0x3ff); \
+      reinterpret_cast<uint16*>(src_v + SOFF)[i] = (fastrand() & 0x3ff); \
+    } \
+    memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
+    memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    FMT_PLANAR##To##FMT_B(reinterpret_cast<const uint16*>(src_y + SOFF), kWidth, \
+                          reinterpret_cast<const uint16*>(src_u + SOFF), kStrideUV, \
+                          reinterpret_cast<const uint16*>(src_v + SOFF), kStrideUV, \
+                          dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      FMT_PLANAR##To##FMT_B( \
+          reinterpret_cast<const uint16*>(src_y + SOFF), kWidth, \
+          reinterpret_cast<const uint16*>(src_u + SOFF), kStrideUV, \
+          reinterpret_cast<const uint16*>(src_v + SOFF), kStrideUV, \
+          dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
+    } \
+    int max_diff = 0; \
+    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+      int abs_diff = abs(static_cast<int>(dst_argb_c[i + DOFF]) - \
+                         static_cast<int>(dst_argb_opt[i + DOFF])); \
+      if (abs_diff > max_diff) { \
+        max_diff = abs_diff; \
+      } \
+    } \
+    EXPECT_LE(max_diff, DIFF); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_u); \
+    free_aligned_buffer_page_end(src_v); \
+    free_aligned_buffer_page_end(dst_argb_c); \
+    free_aligned_buffer_page_end(dst_argb_opt); \
   }
 
 #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                         YALIGN, DIFF, FMT_C, BPP_C) \
   TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
-                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+                   YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 1, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0, FMT_C, \
+                   BPP_C) \
+  TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+                   YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0, FMT_C, \
+                   BPP_C)
 
 TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2, AR30, 4)
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 151bcafd1..6e1c27cad 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2720,10 +2720,14 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
                     dst_pixels_y_c, 16384, kPixels);
 
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
       Convert16To8Row_AVX2(reinterpret_cast<const uint16*>(src_pixels_y),
                            dst_pixels_y_opt, 16384, kPixels);
+    } else if (has_ssse3) {
+      Convert16To8Row_SSSE3(reinterpret_cast<const uint16*>(src_pixels_y),
+                            dst_pixels_y_opt, 16384, kPixels);
     } else {
       Convert16To8Row_C(reinterpret_cast<const uint16*>(src_pixels_y),
                         dst_pixels_y_opt, 16384, kPixels);