2x down sample for UV planes ported to SSSE3 / NEON

Bug: libyuv:838
Change-Id: Id9fb3282a3e86143d76b5e0cb557f0523a88b3c8
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2465578
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Authored by Frank Barchard on 2020-10-12 16:06:09 -07:00; committed by Commit Bot
parent b6f3cff282
commit d730dc2f18
12 changed files with 334 additions and 144 deletions
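At its core, the commit vectorizes libyuv's 2x2 box filter for interleaved UV planes: each output UV pair is the rounded average of a 2x2 block of source UV pairs. A minimal standalone C sketch of that reference filter, mirroring the ScaleUVRowDown2Box_C code that appears later in this diff (the standalone wrapper function here is illustrative):

#include <stddef.h>
#include <stdint.h>

// 2x2 box filter for an interleaved UV row pair: averages U and V
// separately, with +2 for round-to-nearest before the divide by 4.
static void UVRowDown2Box(const uint8_t* src_uv, ptrdiff_t src_stride,
                          uint8_t* dst_uv, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
                 src_uv[src_stride + 2] + 2) >> 2;  // U
    dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
                 src_uv[src_stride + 3] + 2) >> 2;  // V
    src_uv += 4;  // consume 2 source UV pairs
    dst_uv += 2;  // produce 1 destination UV pair
  }
}

The SSSE3 and NEON kernels below compute exactly this, 4 and 8 output pairs per loop iteration respectively.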


@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1765
Version: 1766
License: BSD
License File: LICENSE


@@ -77,6 +77,10 @@ int ARGBToAR30(const uint8_t* src_argb,
int width,
int height);
// Aliases
#define ABGRToRGB24 ARGBToRAW
#define ABGRToRAW ARGBToRGB24
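These aliases hold because, in libyuv's little-endian byte-order naming, each pair is the same byte shuffle; a sketch of why:

// Byte order in memory: ARGB = B,G,R,A   ABGR = R,G,B,A
//                        RGB24 = B,G,R    RAW  = R,G,B
// ABGRToRGB24: R,G,B,A -> B,G,R  == ARGBToRAW:   B,G,R,A -> R,G,B
// ABGRToRAW:   R,G,B,A -> R,G,B  == ARGBToRGB24: B,G,R,A -> B,G,R
// Both drop alpha; the 3-byte shuffle (reverse vs. copy) is identical.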
// Convert ARGB To RGB24.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,


@@ -72,6 +72,13 @@ extern "C" {
#define HAS_SCALEROWDOWN4_SSSE3
#endif
// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#endif
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
@@ -98,6 +105,11 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#endif
// The following are available on 64 bit Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALEUVROWDOWN2BOX_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_SCALEADDROW_MSA
#define HAS_SCALEARGBCOLS_MSA
@@ -830,15 +842,15 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
int dst_width);
// UV Row functions
void ScaleUVRowDown2_SSE2(const uint8_t* src_uv,
void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Linear_SSE2(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Box_SSE2(const uint8_t* src_uv,
void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
@@ -846,7 +858,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
@@ -854,42 +866,42 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width);
void ScaleUVRowDown2_MSA(const uint8_t* src_uv,
void ScaleUVRowDown2_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Linear_MSA(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Box_MSA(const uint8_t* src_uv,
void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2_MMI(const uint8_t* src_uv,
void ScaleUVRowDown2_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Linear_MMI(const uint8_t* src_uv,
void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2Box_MMI(const uint8_t* src_uv,
void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDown2_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -926,52 +938,52 @@ void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDownEven_SSE2(const uint8_t* src_uv,
void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_SSE2(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_NEON(const uint8_t* src_uv,
void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_MSA(const uint8_t* src_uv,
void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_MMI(const uint8_t* src_uv,
void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int32_t src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_uv,
void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowDownEven_Any_SSE2(const uint8_t* src_ptr,
void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
int src_stepx,
uint8_t* dst_ptr,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1765
#define LIBYUV_VERSION 1766
#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -20,49 +20,6 @@ namespace libyuv {
extern "C" {
#endif
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
int dx) { \
int r = dst_width & MASK; \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
}
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEFILTERCOLS_MSA
CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_MSA
CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBCOLS_MMI
CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C,
4,
3)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_MSA
CANY(ScaleARGBFilterCols_Any_MSA,
ScaleARGBFilterCols_MSA,
ScaleARGBFilterCols_C,
4,
7)
#endif
#undef CANY
// Fixed scale down.
// Mask may be non-power of 2, so use MOD
#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \
@@ -113,6 +70,14 @@ SDODD(ScaleRowDown2Box_Odd_SSSE3,
1,
15)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
SDANY(ScaleUVRowDown2Box_Any_SSSE3,
ScaleUVRowDown2Box_SSSE3,
ScaleUVRowDown2Box_C,
2,
2,
4)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
@@ -155,6 +120,15 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
1,
15)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
SDANY(ScaleUVRowDown2Box_Any_NEON,
ScaleUVRowDown2Box_NEON,
ScaleUVRowDown2Box_C,
2,
2,
8)
#endif
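Both registrations use the SDANY wrapper (defined earlier in this file): the SIMD kernel covers the widest prefix it can, and the C kernel finishes the leftover columns. A sketch of what the NEON registration above plausibly expands to, with FACTOR = 2 (two source UV pairs per output) and BPP = 2 (bytes per UV pair); the remainder uses MOD rather than a bitwise AND because, per the comment above, the mask may be non-power-of-2:

// Illustrative expansion of SDANY(ScaleUVRowDown2Box_Any_NEON, ..., 2, 2, 8).
void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst_ptr, int dst_width) {
  int r = (int)((unsigned int)dst_width % (8 + 1));  // leftover outputs
  int n = dst_width - r;                             // SIMD-covered outputs
  if (n > 0) {
    ScaleUVRowDown2Box_NEON(src_ptr, src_stride, dst_ptr, n);
  }
  ScaleUVRowDown2Box_C(src_ptr + (n * 2) * 2,  // n outputs * FACTOR * BPP bytes
                       src_stride, dst_ptr + n * 2, r);
}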
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
@@ -577,6 +551,49 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
#endif // SASIMDONLY
// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
int dx) { \
int r = dst_width & MASK; \
int n = dst_width & ~MASK; \
if (n > 0) { \
TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \
} \
TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \
}
#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#ifdef HAS_SCALEFILTERCOLS_MSA
CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
#endif
#ifdef HAS_SCALEARGBCOLS_NEON
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
#endif
#ifdef HAS_SCALEARGBCOLS_MSA
CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
#endif
#ifdef HAS_SCALEARGBCOLS_MMI
CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_NEON
CANY(ScaleARGBFilterCols_Any_NEON,
ScaleARGBFilterCols_NEON,
ScaleARGBFilterCols_C,
4,
3)
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_MSA
CANY(ScaleARGBFilterCols_Any_MSA,
ScaleARGBFilterCols_MSA,
ScaleARGBFilterCols_C,
4,
7)
#endif
#undef CANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@@ -1063,11 +1063,9 @@ void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
int x;
for (x = 0; x < dst_width; ++x) {
dst_uv[0] = (src_uv[0] + src_uv[2] + src_uv[src_stride] +
src_uv[src_stride + 2] + 2) >>
2;
src_uv[src_stride + 2] + 2) >> 2;
dst_uv[1] = (src_uv[1] + src_uv[3] + src_uv[src_stride + 1] +
src_uv[src_stride + 3] + 2) >>
2;
src_uv[src_stride + 3] + 2) >> 2;
src_uv += 4;
dst_uv += 2;
}


@@ -1366,6 +1366,52 @@ int FixedDiv1_X86(int num, int div) {
return num;
}
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
// Shuffle table for splitting UV into upper and lower part of register.
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, 6u, 14u,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5, %%xmm5 \n" // zero
"movdqa %4,%%xmm1 \n" // split shuffler
"movdqa %5,%%xmm3 \n" // merge shuffler
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 8 UV row 0
"movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1
"lea 0x10(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv
"pshufb %%xmm1,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add
"pmaddubsw %%xmm4,%%xmm2 \n"
"paddw %%xmm2,%%xmm0 \n" // vertical add
"psrlw $0x1,%%xmm0 \n" // round
"pavgw %%xmm5,%%xmm0 \n"
"pshufb %%xmm3,%%xmm0 \n" // merge uv
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n" // 4 UV
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
#endif // defined(__x86_64__) || defined(__i386__)
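The tail of the kernel carries the interesting arithmetic: pmaddubsw against the 0x01 words gives the horizontal pair sums, paddw adds the two rows, and then psrlw $1 followed by pavgw against zero (pavgw computes (a + b + 1) >> 1) produces floor((sum + 2) / 4), exactly the C code's (sum + 2) >> 2 rounding. A scalar check of that identity over the full range of a four-sample sum:

#include <assert.h>
#include <stdint.h>

// pavgw-style rounded average of two 16-bit values.
static uint16_t Pavgw(uint16_t a, uint16_t b) {
  return (uint16_t)((a + b + 1) >> 1);
}

int main(void) {
  uint16_t sum;
  // A sum of four 8-bit samples is at most 4 * 255 = 1020.
  for (sum = 0; sum <= 1020; ++sum) {
    assert(Pavgw(sum >> 1, 0) == (sum + 2) >> 2);  // psrlw $1, then pavgw 0
  }
  return 0;
}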
#ifdef __cplusplus


@@ -950,6 +950,35 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
#undef LOAD2_DATA32_LANE
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts.
"vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV
"vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV
"vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts.
"vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes
"vrshrn.u16 d1, q1, #2 \n"
"vst2.8 {d0, d1}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "q0", "q1", "q8", "q9");
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
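The 32-bit Neon version reaches the same (sum + 2) >> 2 result more directly: vpaddl.u8 widens and adds horizontal pairs from row 0, vpadal.u8 accumulates the pairs from row 1, and vrshrn.u16 #2 is a rounding narrowing shift, folding the +2 bias and the shift into one instruction (the AArch64 version in the next file uses the equivalent uaddlp/uadalp/rshrn). A scalar model of the per-sample pipeline, as a sketch:

// One output byte of the Neon pipeline (shown for U; V is identical):
static uint8_t BoxSample(uint8_t a0, uint8_t a1,    // row 0 neighbors
                         uint8_t b0, uint8_t b1) {  // row 1 neighbors
  uint16_t sum = (uint16_t)(a0 + a1);  // vpaddl.u8: pairwise add, widen
  sum = (uint16_t)(sum + b0 + b1);     // vpadal.u8: pairwise add-accumulate
  return (uint8_t)((sum + 2) >> 2);    // vrshrn.u16 #2: rounding narrow shift
}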
#ifdef __cplusplus


@@ -1086,6 +1086,35 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
);
}
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts.
"ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16
"uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts.
"uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"prfm pldl1keep, [%1, 448] \n"
"rshrn v1.8b, v1.8h, #2 \n"
"st2 {v0.8b,v1.8b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v16", "v17");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus


@@ -73,22 +73,40 @@ static void ScaleUVDown2(int src_width,
src_uv += (y >> 16) * src_stride + ((x >> 16) - 1) * 2;
}
#if defined(HAS_SCALEUVROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_Any_SSE2
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSE2
: ScaleUVRowDown2Box_Any_SSE2);
if (IS_ALIGNED(dst_width, 2)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_SSE2
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSE2
: ScaleUVRowDown2Box_SSE2);
#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
}
}
#endif
// This code is not enabled. Only box filter is available at this time.
#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_Any_SSSE3
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
: ScaleUVRowDown2Box_Any_SSSE3);
if (IS_ALIGNED(dst_width, 2)) {
ScaleUVRowDown2 =
filtering == kFilterNone
? ScaleUVRowDown2_SSSE3
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
: ScaleUVRowDown2Box_SSSE3);
}
}
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
if (TestCpuFlag(kCpuHasNEON) && filtering) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
if (IS_ALIGNED(dst_width, 8)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
}
}
#endif
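All of these kernels are reached through the public UVScale() entry point, which selects a row function per the dispatch above. A minimal usage sketch (dimensions and buffer management are illustrative), halving the chroma plane of an NV12 720p frame with the box filter, which now takes the SSSE3/NEON path:

#include "libyuv/scale.h"     // FilterMode
#include "libyuv/scale_uv.h"  // UVScale()

// 1280x720 NV12 has a 640x360 interleaved UV plane; halve it to 320x180.
// Strides are in bytes: 2 bytes per UV pair. Returns 0 on success.
int HalveChroma(const uint8_t* src_uv, uint8_t* dst_uv) {
  return UVScale(src_uv, 640 * 2, 640, 360,
                 dst_uv, 320 * 2, 320, 180,
                 kFilterBox);
}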
// This code is not enabled. Only box filter is available at this time.
#if defined(HAS_SCALEUVROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleUVRowDown2 =
@@ -180,11 +198,11 @@ static void ScaleUVDown4Box(int src_width,
(void)dx;
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEUVROWDOWN2_SSE2)
#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSE2;
ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
ScaleUVRowDown2 = ScaleUVRowDown2Box_SSE2;
ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
}
}
#endif
@@ -237,13 +255,13 @@ static void ScaleUVDownEven(int src_width,
assert(IS_ALIGNED(src_width, 2));
assert(IS_ALIGNED(src_height, 2));
src_uv += (y >> 16) * src_stride + (x >> 16) * 2;
#if defined(HAS_SCALEUVROWDOWNEVEN_SSE2)
#if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSE2
: ScaleUVRowDownEven_Any_SSE2;
ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3
: ScaleUVRowDownEven_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
ScaleUVRowDownEven =
filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSE2;
filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3;
}
}
#endif
@@ -494,9 +512,9 @@ static void ScaleUVBilinearUp(int src_width,
}
}
#endif
#if defined(HAS_SCALEUVCOLS_SSE2)
#if defined(HAS_SCALEUVCOLS_SSSE3)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleUVFilterCols = ScaleUVCols_SSE2;
ScaleUVFilterCols = ScaleUVCols_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLS_NEON)
@@ -525,9 +543,9 @@ static void ScaleUVBilinearUp(int src_width,
#endif
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleUVFilterCols = ScaleUVColsUp2_C;
#if defined(HAS_SCALEUVCOLSUP2_SSE2)
#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleUVFilterCols = ScaleUVColsUp2_SSE2;
ScaleUVFilterCols = ScaleUVColsUp2_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLSUP2_MMI)
@@ -612,9 +630,9 @@ static void ScaleUVSimple(int src_width,
int x, int dx) =
(src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
(void)src_height;
#if defined(HAS_SCALEUVCOLS_SSE2)
#if defined(HAS_SCALEUVCOLS_SSSE3)
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleUVCols = ScaleUVCols_SSE2;
ScaleUVCols = ScaleUVCols_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLS_NEON)
@@ -643,9 +661,9 @@ static void ScaleUVSimple(int src_width,
#endif
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleUVCols = ScaleUVColsUp2_C;
#if defined(HAS_SCALEUVCOLSUP2_SSE2)
#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleUVCols = ScaleUVColsUp2_SSE2;
ScaleUVCols = ScaleUVColsUp2_SSSE3;
}
#endif
#if defined(HAS_SCALEUVCOLSUP2_MMI)


@@ -1114,6 +1114,8 @@ TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1)
TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1)
TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1)
TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1)
TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1)
TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1)
#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1)
#endif


@@ -14,7 +14,6 @@
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale_uv.h"
#include "libyuv/video_common.h"
namespace libyuv {
@@ -23,13 +22,13 @@ namespace libyuv {
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int UVTestFilter(int src_width,
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
int src_height,
int dst_width,
int dst_height,
FilterMode f,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info) {
if (!SizeValid(src_width, src_height, dst_width, dst_height)) {
return 0;
}
@@ -47,7 +46,8 @@ static int UVTestFilter(int src_width,
}
MemRandomize(src_uv, src_uv_plane_size);
int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL;
int64_t dst_uv_plane_size =
(dst_width + b * 2) * (dst_height + b * 2) * 2LL;
int dst_stride_uv = (b * 2 + dst_width) * 2;
align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
@@ -61,28 +61,29 @@ static int UVTestFilter(int src_width,
// Warm up both versions for consistent benchmarks.
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2,
dst_stride_uv, dst_width, dst_height, f);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2,
dst_stride_uv, dst_width, dst_height, f);
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
double c_time = get_time();
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2,
dst_stride_uv, dst_width, dst_height, f);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
double opt_time = get_time();
for (i = 0; i < benchmark_iterations; ++i) {
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv,
src_width, src_height,
dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
dst_width, dst_height, f);
}
opt_time = (get_time() - opt_time) / benchmark_iterations;
@@ -111,22 +112,56 @@ static int UVTestFilter(int src_width,
return max_diff;
}
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = UVTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
// The following adjustments in dimensions ensure the scale factor will be
// exactly achieved.
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
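For instance, with a hypothetical benchmark width of 1280 and the 3/4 factor (nom = 3, denom = 4), the macros land on a pair of widths whose ratio is exactly 3:4:

// SX(1280, 3, 4) = (1280 / 3) * 4 = 426 * 4 = 1704   // source width
// DX(1280, 3, 4) = (1280 / 3) * 3 = 426 * 3 = 1278   // destination width
// 1278 : 1704 reduces to 3 : 4, so the factor is achieved with no rounding.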
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \
int diff = UVTestFilter( \
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
/// Test scale to a specified size with all 3 filters.
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
#define TEST_FACTOR(name, nom, denom) \
TEST_FACTOR1(name, None, nom, denom, 0) \
TEST_FACTOR1(name, Linear, nom, denom, 3) \
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
TEST_FACTOR1(name, Box, nom, denom, 3)
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
// TEST_FACTOR(8, 1, 8) Disable for benchmark performance.
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX
#define TEST_SCALETO1(name, width, height, filter, max_diff) \
TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \
height, kFilter##filter, benchmark_iterations_, \
disable_cpu_flags_, benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
int diff = UVTestFilter(width, height, Abs(benchmark_width_), \
Abs(benchmark_height_), kFilter##filter, \
benchmark_iterations_, disable_cpu_flags_, \
benchmark_cpu_info_); \
EXPECT_LE(diff, max_diff); \
}
/// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(name, width, height, None, 0) \
TEST_SCALETO1(name, width, height, Linear, 3) \