Merge/SplitRGB fix for -mcmodel=large on x86, and add InterpolateRow_16To8_NEON

MergeRGB and SplitRGB now use a single register to point to the 9 shuffle tables.

- fixes an out-of-registers error with -mcmodel=large (see the sketch below)
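
For illustration, a minimal sketch of the operand change, not the actual libyuv code (names and masks here are placeholders; assumes x86 GCC/Clang inline asm with SSSE3). Before, each shuffle table was its own "m" operand, and under -mcmodel=large each can demand its own address register; after, a single "r" base pointer reaches every table through a constant displacement:

#include <stdint.h>

typedef uint8_t uvec8[16] __attribute__((aligned(16)));
static const uvec8 kTables[9] = {{0}};  /* placeholder 16-byte shuffle masks */

void shuffle_sketch(const uint8_t* src, uint8_t* dst) {
  asm volatile(
      "movdqu    (%0),%%xmm0                 \n"
      "pshufb    0(%2),%%xmm0                \n"  /* mask 0 at base + 0 */
      "movdqu    %%xmm0,(%1)                 \n"
      "movdqu    (%0),%%xmm0                 \n"
      "pshufb    16(%2),%%xmm0               \n"  /* mask 1 at base + 16 */
      "movdqu    %%xmm0,16(%1)               \n"
      :
      : "r"(src), "r"(dst), "r"(&kTables[0])  /* one register, nine tables */
      : "memory", "xmm0");
}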

InterpolateRow_16To8_NEON improves performance for I210ToI420:

On Pixel 4, 720p x1000 images:
Was I210ToI420_Opt (608 ms)
Now I210ToI420_Opt (336 ms)

On Skylake Xeon:
Was I210ToI420_Opt (259 ms)
Now I210ToI420_Opt (209 ms)


Bug: libyuv:931, libyuv:930
Change-Id: I20f8244803f06da511299bf1a2ffc7945eb35221
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3717054
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Frank Barchard 2022-06-28 16:31:22 -07:00 committed by libyuv LUCI CQ
parent fe4a50df8e
commit 6900494d90
8 changed files with 339 additions and 181 deletions


@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1831
Version: 1832
License: BSD
License File: LICENSE


@@ -348,6 +348,7 @@ extern "C" {
#define HAS_AR64TOARGBROW_AVX2
#define HAS_AB64TOARGBROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_INTERPOLATEROW_16TO8_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_DIVIDEROW_16_AVX2
#define HAS_HALFMERGEUVROW_AVX2
@@ -539,6 +540,7 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_INTERPOLATEROW_16TO8_NEON
#define HAS_SCALESUMSAMPLES_NEON
#define HAS_GAUSSROW_F32_NEON
#define HAS_GAUSSCOL_F32_NEON
@@ -5221,6 +5223,30 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
int scale,
int width,
int source_y_fraction);
void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);
void InterpolateRow_16To8_Any_NEON(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);
void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);
void InterpolateRow_16To8_Any_AVX2(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction);
// Sobel images.
void SobelXRow_C(const uint8_t* src_y0,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1831
#define LIBYUV_VERSION 1832
#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -1625,46 +1625,100 @@ ANY11C(UYVYToARGBRow_Any_LSX, UYVYToARGBRow_LSX, 1, 4, 4, 7)
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11I(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
void NAMEANY(T* dst_ptr, const T* src_ptr, ptrdiff_t src_stride, int width, \
int source_y_fraction) { \
SIMD_ALIGNED(T temp[64 * 3]); \
memset(temp, 0, 64 * 2 * sizeof(T)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP * sizeof(T)); \
if (source_y_fraction) { \
memcpy(temp + 64, src_ptr + src_stride + n * SBPP, \
r * SBPP * sizeof(T)); \
} \
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP * sizeof(T)); \
#define ANY11I(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
int width, int source_y_fraction) { \
SIMD_ALIGNED(TS temps[64 * 2]); \
SIMD_ALIGNED(TD tempd[64]); \
memset(temps, 0, sizeof(temps)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
} \
memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
if (source_y_fraction) { \
memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
r * SBPP * sizeof(TS)); \
} \
ANY_SIMD(tempd, temps, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
}
#ifdef HAS_INTERPOLATEROW_AVX2
ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, 1, 1, 31)
ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, uint8_t, uint8_t, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, uint8_t, 1, 1, 15)
ANY11I(InterpolateRow_Any_SSSE3,
InterpolateRow_SSSE3,
uint8_t,
uint8_t,
1,
1,
15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, 1, 1, 15)
ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, uint8_t, uint8_t, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_MSA
ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, 1, 1, 31)
ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, uint8_t, uint8_t, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_LSX
ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, 1, 1, 31)
ANY11I(InterpolateRow_Any_LSX, InterpolateRow_LSX, uint8_t, uint8_t, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_16_NEON
ANY11I(InterpolateRow_16_Any_NEON, InterpolateRow_16_NEON, uint16_t, 1, 1, 7)
ANY11I(InterpolateRow_16_Any_NEON,
InterpolateRow_16_NEON,
uint16_t,
uint16_t,
1,
1,
7)
#endif
#undef ANY11I
// Any 1 to 1 interpolate with scale param
#define ANY11IS(NAMEANY, ANY_SIMD, TD, TS, SBPP, BPP, MASK) \
void NAMEANY(TD* dst_ptr, const TS* src_ptr, ptrdiff_t src_stride, \
int scale, int width, int source_y_fraction) { \
SIMD_ALIGNED(TS temps[64 * 2]); \
SIMD_ALIGNED(TD tempd[64]); \
memset(temps, 0, sizeof(temps)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride, scale, n, source_y_fraction); \
} \
memcpy(temps, src_ptr + n * SBPP, r * SBPP * sizeof(TS)); \
if (source_y_fraction) { \
memcpy(temps + 64, src_ptr + src_stride + n * SBPP, \
r * SBPP * sizeof(TS)); \
} \
ANY_SIMD(tempd, temps, 64, scale, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, tempd, r * BPP * sizeof(TD)); \
}
#ifdef HAS_INTERPOLATEROW_16TO8_NEON
ANY11IS(InterpolateRow_16To8_Any_NEON,
InterpolateRow_16To8_NEON,
uint8_t,
uint16_t,
1,
1,
7)
#endif
#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
ANY11IS(InterpolateRow_16To8_Any_AVX2,
InterpolateRow_16To8_AVX2,
uint8_t,
uint16_t,
1,
1,
31)
#endif
#undef ANY11I
#undef ANY11IS
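The ANY11IS wrapper follows libyuv's usual any-width pattern: run the SIMD kernel over the multiple-of-(MASK+1) prefix, then copy the remainder into zeroed temporaries and run the kernel once more on a single full block. A scalar sketch of the same idea (the kernel here is an illustrative stand-in, not a libyuv function; the real C path special-cases source_y_fraction == 0 so the second row is never read needlessly):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar stand-in for a SIMD kernel that requires width % 8 == 0. */
static void Kernel(uint8_t* dst, const uint16_t* src, ptrdiff_t stride,
                   int scale, int width, int frac) {
  for (int x = 0; x < width; ++x) {
    uint32_t v = (src[x] * (256 - frac) + src[stride + x] * frac + 128) >> 8;
    uint32_t s = (v * (uint32_t)scale) >> 16;
    dst[x] = s > 255 ? 255 : (uint8_t)s;
  }
}

void Kernel_Any(uint8_t* dst, const uint16_t* src, ptrdiff_t stride,
                int scale, int width, int frac) {
  uint16_t temps[64 * 2] = {0};  /* zeroed so the tail lanes are defined */
  uint8_t tempd[64];
  int r = width & 7;   /* remainder pixels */
  int n = width & ~7;  /* multiple-of-8 prefix */
  if (n > 0) Kernel(dst, src, stride, scale, n, frac);
  memcpy(temps, src + n, r * sizeof(uint16_t));
  if (frac) memcpy(temps + 64, src + stride + n, r * sizeof(uint16_t));
  Kernel(tempd, temps, 64, scale, 8, frac);  /* one full block on the tail */
  memcpy(dst + n, tempd, r);
}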
// Any 1 to 1 mirror.
#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \


@@ -2985,6 +2985,9 @@ void DivideRow_16_C(const uint16_t* src_y,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)
void Convert16To8Row_C(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
@@ -2994,7 +2997,7 @@ void Convert16To8Row_C(const uint16_t* src_y,
assert(scale <= 32768);
for (x = 0; x < width; ++x) {
dst_y[x] = clamp255((src_y[x] * scale) >> 16);
dst_y[x] = C16TO8(src_y[x], scale);
}
}
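The scale values in the comment map each bit depth's maximum code value onto 255. A quick arithmetic check of the macro (standalone C; clamp255 is re-declared here only for the sketch):

#include <stdint.h>
#include <stdio.h>

static int clamp255(int v) { return v > 255 ? 255 : (v < 0 ? 0 : v); }
#define C16TO8(v, scale) clamp255(((v) * (scale)) >> 16)

int main(void) {
  printf("%d\n", C16TO8(511, 32768));   /* 9 bits  -> 255 */
  printf("%d\n", C16TO8(1023, 16384));  /* 10 bits -> 255 */
  printf("%d\n", C16TO8(4095, 4096));   /* 12 bits -> 255 */
  printf("%d\n", C16TO8(65535, 256));   /* 16 bits -> 255 */
  return 0;
}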
@@ -3411,8 +3414,7 @@ static void HalfRow_16To8_C(const uint16_t* src_uv,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_uv[x] = clamp255(
(((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1) * scale) >> 16);
dst_uv[x] = C16TO8((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1, scale);
}
}
@@ -3426,6 +3428,9 @@ void InterpolateRow_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint8_t* src_ptr1 = src_ptr + src_stride;
int x;
assert(source_y_fraction >= 0);
assert(source_y_fraction < 256);
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width);
return;
@@ -3434,18 +3439,42 @@ void InterpolateRow_C(uint8_t* dst_ptr,
HalfRow_C(src_ptr, src_stride, dst_ptr, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
for (x = 0; x < width; ++x) {
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
dst_ptr[1] =
(src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 128) >> 8;
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
++src_ptr;
++src_ptr1;
++dst_ptr;
}
if (width & 1) {
}
// C version 2x2 -> 2x1.
void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
assert(source_y_fraction >= 0);
assert(source_y_fraction < 256);
if (y1_fraction == 0) {
memcpy(dst_ptr, src_ptr, width * 2);
return;
}
if (y1_fraction == 128) {
HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
return;
}
for (x = 0; x < width; ++x) {
dst_ptr[0] =
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8;
++src_ptr;
++src_ptr1;
++dst_ptr;
}
}
@@ -3455,6 +3484,8 @@ void InterpolateRow_C(uint8_t* dst_ptr,
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
void InterpolateRow_16To8_C(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
@@ -3465,6 +3496,9 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
assert(source_y_fraction >= 0);
assert(source_y_fraction < 256);
if (source_y_fraction == 0) {
Convert16To8Row_C(src_ptr, dst_ptr, scale, width);
return;
@@ -3473,53 +3507,13 @@ void InterpolateRow_16To8_C(uint8_t* dst_ptr,
HalfRow_16To8_C(src_ptr, src_stride, dst_ptr, scale, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
dst_ptr[0] = clamp255(
(((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
scale) >>
16);
dst_ptr[1] = clamp255(
(((src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8) *
scale) >>
16);
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
}
if (width & 1) {
dst_ptr[0] = clamp255(
(((src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8) *
scale) >>
16);
}
}
void InterpolateRow_16_C(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int x;
if (source_y_fraction == 0) {
memcpy(dst_ptr, src_ptr, width * 2);
return;
}
if (source_y_fraction == 128) {
HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
return;
}
for (x = 0; x < width - 1; x += 2) {
dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
src_ptr += 2;
src_ptr1 += 2;
dst_ptr += 2;
}
if (width & 1) {
dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
for (x = 0; x < width; ++x) {
dst_ptr[0] = C16TO8(
(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 128) >> 8,
scale);
src_ptr += 1;
src_ptr1 += 1;
dst_ptr += 1;
}
}
@@ -4124,6 +4118,26 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
}
#endif // HAS_RAWTOYJROW_SSSE3
#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int width,
int source_y_fraction) {
// Row buffer for intermediate 16 bit pixels.
SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
src_ptr += twidth;
dst_ptr += twidth;
width -= twidth;
}
}
#endif // HAS_INTERPOLATEROW_16TO8_AVX2
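A hypothetical usage sketch of the shared InterpolateRow_16To8 contract (shown against the C fallback, which has the same signature; the values are illustrative and assume libyuv's internal header is available to the caller):

#include <stdint.h>
#include "libyuv/row.h"  /* internal header; assumes building inside libyuv */

void blend_rows_sketch(void) {
  uint16_t rows[2][64];  /* two adjacent 10-bit source rows */
  uint8_t dst[64];
  for (int i = 0; i < 64; ++i) {
    rows[0][i] = 400;  /* arbitrary 10-bit sample values */
    rows[1][i] = 800;
  }
  /* src_stride is in uint16_t units. scale 16384 treats input as 10-bit;
     source_y_fraction 64 weights the second row at 64/256 = 25%, so each
     output is C16TO8((400*192 + 800*64 + 128) >> 8, 16384) = 125. */
  InterpolateRow_16To8_C(dst, rows[0], 64, 16384, 64, 64);
}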
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fsum = 0.f;
int i;


@@ -5198,37 +5198,26 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
#endif // HAS_CONVERT8TO16ROW_AVX2
#ifdef HAS_SPLITRGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u};
static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
3u, 6u, 9u, 12u, 15u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 2u,
5u, 8u, 11u, 14u};
static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u,
128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
4u, 7u, 10u, 13u, 128u, 128u,
128u, 128u, 128u, 128u};
static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 0u, 3u,
6u, 9u, 12u, 15u};
static const uvec8 kSplitRGBShuffle[9] = {
{0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u},
{128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u,
128u, 128u},
{128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u,
7u, 10u, 13u},
{1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u},
{128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u,
128u, 128u},
{128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u,
8u, 11u, 14u},
{2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u},
{128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u,
128u, 128u},
{128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u,
12u, 15u}};
void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
uint8_t* dst_r,
@@ -5242,9 +5231,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"pshufb 0(%5), %%xmm0 \n"
"pshufb 16(%5), %%xmm1 \n"
"pshufb 32(%5), %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
@@ -5253,9 +5242,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"pshufb 48(%5),%%xmm0 \n"
"pshufb 64(%5),%%xmm1 \n"
"pshufb 80(%5), %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%2) \n"
@@ -5264,9 +5253,9 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"pshufb 96(%5), %%xmm0 \n"
"pshufb 112(%5), %%xmm1 \n"
"pshufb 128(%5), %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%3) \n"
@@ -5279,51 +5268,32 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRGBToR0), // %5
"m"(kShuffleMaskRGBToR1), // %6
"m"(kShuffleMaskRGBToR2), // %7
"m"(kShuffleMaskRGBToG0), // %8
"m"(kShuffleMaskRGBToG1), // %9
"m"(kShuffleMaskRGBToG2), // %10
"m"(kShuffleMaskRGBToB0), // %11
"m"(kShuffleMaskRGBToB1), // %12
"m"(kShuffleMaskRGBToB2) // %13
: "r"(&kSplitRGBShuffle[0]) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_SPLITRGBROW_SSSE3
#ifdef HAS_MERGERGBROW_SSSE3
// Shuffle table for converting RGB to Planar.
static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
2u, 128u, 128u, 3u, 128u, 128u,
4u, 128u, 128u, 5u};
static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
128u, 2u, 128u, 128u, 3u, 128u,
128u, 4u, 128u, 128u};
static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
128u, 128u, 2u, 128u, 128u, 3u,
128u, 128u, 4u, 128u};
static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
7u, 128u, 128u, 8u, 128u, 128u,
9u, 128u, 128u, 10u};
static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
128u, 7u, 128u, 128u, 8u, 128u,
128u, 9u, 128u, 128u};
static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u,
128u, 128u, 8u, 128u, 128u, 9u,
128u, 128u, 10u, 128u};
static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
12u, 128u, 128u, 13u, 128u, 128u,
14u, 128u, 128u, 15u};
static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
128u, 13u, 128u, 128u, 14u, 128u,
128u, 15u, 128u, 128u};
static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
128u, 128u, 13u, 128u, 128u, 14u,
128u, 128u, 15u, 128u};
// Shuffle table for converting Planar to RGB.
static const uvec8 kMergeRGBShuffle[9] = {
{0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u,
128u, 5u},
{128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u,
128u, 128u},
{128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u,
4u, 128u},
{128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u,
10u, 128u},
{5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u,
128u, 10u},
{128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u,
128u, 128u},
{128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u,
15u, 128u, 128u},
{128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u,
128u, 15u, 128u},
{10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u,
128u, 128u, 15u}};
void MergeRGBRow_SSSE3(const uint8_t* src_r,
const uint8_t* src_g,
@@ -5337,9 +5307,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
"movdqu (%0),%%xmm0 \n"
"movdqu (%1),%%xmm1 \n"
"movdqu (%2),%%xmm2 \n"
"pshufb %5, %%xmm0 \n"
"pshufb %6, %%xmm1 \n"
"pshufb %7, %%xmm2 \n"
"pshufb (%5), %%xmm0 \n"
"pshufb 16(%5), %%xmm1 \n"
"pshufb 32(%5), %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%3) \n"
@@ -5347,9 +5317,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
"movdqu (%0),%%xmm0 \n"
"movdqu (%1),%%xmm1 \n"
"movdqu (%2),%%xmm2 \n"
"pshufb %8, %%xmm0 \n"
"pshufb %9, %%xmm1 \n"
"pshufb %10, %%xmm2 \n"
"pshufb 48(%5), %%xmm0 \n"
"pshufb 64(%5), %%xmm1 \n"
"pshufb 80(%5), %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,16(%3) \n"
@@ -5357,9 +5327,9 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
"movdqu (%0),%%xmm0 \n"
"movdqu (%1),%%xmm1 \n"
"movdqu (%2),%%xmm2 \n"
"pshufb %11, %%xmm0 \n"
"pshufb %12, %%xmm1 \n"
"pshufb %13, %%xmm2 \n"
"pshufb 96(%5), %%xmm0 \n"
"pshufb 112(%5), %%xmm1 \n"
"pshufb 128(%5), %%xmm2 \n"
"por %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,32(%3) \n"
@@ -5375,15 +5345,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
"+r"(src_b), // %2
"+r"(dst_rgb), // %3
"+r"(width) // %4
: "m"(kShuffleMaskRToRGB0), // %5
"m"(kShuffleMaskGToRGB0), // %6
"m"(kShuffleMaskBToRGB0), // %7
"m"(kShuffleMaskRToRGB1), // %8
"m"(kShuffleMaskGToRGB1), // %9
"m"(kShuffleMaskBToRGB1), // %10
"m"(kShuffleMaskRToRGB2), // %11
"m"(kShuffleMaskGToRGB2), // %12
"m"(kShuffleMaskBToRGB2) // %13
: "r"(&kMergeRGBShuffle[0]) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGERGBROW_SSSE3


@@ -3031,6 +3031,86 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
// Bilinear filter 8x2 -> 8x1
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int scale,
int dst_width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
int shift = 15 - __builtin_clz((int32_t)scale); // log2(scale) - 16; a negative shl is a shr
asm volatile(
"dup v6.8h, %w6 \n"
"cmp %w4, #0 \n"
"b.eq 100f \n"
"cmp %w4, #128 \n"
"b.eq 50f \n"
"dup v5.8h, %w4 \n"
"dup v4.8h, %w5 \n"
// General purpose row blend.
"1: \n"
"ld1 {v0.8h}, [%1], #16 \n"
"ld1 {v1.8h}, [%2], #16 \n"
"subs %w3, %w3, #8 \n"
"umull v2.4s, v0.4h, v4.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"umull2 v3.4s, v0.8h, v4.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"umlal v2.4s, v1.4h, v5.4h \n"
"umlal2 v3.4s, v1.8h, v5.8h \n"
"rshrn v0.4h, v2.4s, #8 \n"
"rshrn2 v0.8h, v3.4s, #8 \n"
"ushl v0.8h, v0.8h, v6.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%0], #8 \n"
"b.gt 1b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
"ld1 {v0.8h}, [%1], #16 \n"
"ld1 {v1.8h}, [%2], #16 \n"
"subs %w3, %w3, #8 \n"
"prfm pldl1keep, [%1, 448] \n"
"urhadd v0.8h, v0.8h, v1.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"ushl v0.8h, v0.8h, v6.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%0], #8 \n"
"b.gt 50b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"ldr q0, [%1], #16 \n"
"ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
"prfm pldl1keep, [%1, 448] \n"
"uqxtn v0.8b, v0.8h \n"
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"str d0, [%0], #8 \n" // store 8 pixels
"b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_ptr1), // %2
"+r"(dst_width) // %3
: "r"(y1_fraction), // %4
"r"(y0_fraction), // %5
"r"(shift) // %6
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
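The ushl trick above depends on scale being a power of two: shift = 15 - clz(scale) equals log2(scale) - 16, so a 10-bit scale of 16384 yields -2, and ushl with a negative count shifts right, matching (v * 16384) >> 16. A quick standalone check (assumes GCC/Clang for __builtin_clz):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int scales[] = {32768, 16384, 4096, 256};  /* 9/10/12/16 bits */
  for (int i = 0; i < 4; ++i) {
    int shift = 15 - __builtin_clz((uint32_t)scales[i]);  /* log2(scale)-16 */
    uint16_t v = 1000;
    int a = ((uint32_t)v * scales[i]) >> 16;        /* multiply-based form */
    int b = shift >= 0 ? v << shift : v >> -shift;  /* shift-based form */
    printf("scale %5d shift %3d: %d == %d\n", scales[i], shift, a, b);
  }
  return 0;
}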
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,


@@ -1605,6 +1605,12 @@ void ScalePlaneVertical_16(int src_height,
}
}
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// TODO(fbarchard): change scale to bits
void ScalePlaneVertical_16To8(int src_height,
int dst_width,
int dst_height,
@@ -1620,7 +1626,7 @@ void ScalePlaneVertical_16To8(int src_height,
enum FilterMode filtering) {
// TODO(fbarchard): Allow higher wpp.
int dst_width_words = dst_width * wpp;
// TODO(https://crbug.com/libyuv/931): Add NEON and AVX2 versions.
// TODO(https://crbug.com/libyuv/931): Add NEON 32 bit and AVX2 versions.
void (*InterpolateRow_16To8)(uint8_t * dst_argb, const uint16_t* src_argb,
ptrdiff_t src_stride, int scale, int dst_width,
int source_y_fraction) = InterpolateRow_16To8_C;
@@ -1632,6 +1638,22 @@ void ScalePlaneVertical_16To8(int src_height,
assert(dst_height > 0);
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16TO8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow_16To8 = InterpolateRow_16To8_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow_16To8 = InterpolateRow_16To8_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16TO8_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow_16To8 = InterpolateRow_16To8_Any_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow_16To8 = InterpolateRow_16To8_AVX2;
}
}
#endif
for (j = 0; j < dst_height; ++j) {
int yi;
int yf;