Move Neon source to its own files.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/860009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@396 16f28f9a-4ce2-e073-06de-1de4eb20be90
Author:  fbarchard@google.com  2012-10-09 00:05:29 +00:00
Parent:  4807dea4e7
Commit:  64ce0ab544
32 changed files with 1262 additions and 868 deletions


@@ -22,8 +22,10 @@ LOCAL_SRC_FILES := \
 ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
     LOCAL_CFLAGS += -DLIBYUV_NEON
     LOCAL_SRC_FILES += \
+        source/compare_neon.cc \
         source/rotate_neon.cc.neon \
-        source/row_neon.cc.neon
+        source/row_neon.cc.neon \
+        source/scale_neon.cc
 endif
 LOCAL_C_INCLUDES += $(LOCAL_PATH)/include


@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 395
+Version: 396
 License: BSD
 License File: LICENSE


@@ -27,12 +27,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
               int width, int height,
               uint32 value);
 
+// Alias.
+#define I400ToI400 CopyPlane
+
 // Copy a plane of data (I420 to I400).
 LIBYUV_API
 void CopyPlane(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y,
                int width, int height);
 
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
 int I420ToI400(const uint8* src_y, int src_stride_y,


@@ -66,6 +66,7 @@ extern "C" {
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
+#define HAS_SETROW_X86
 #define HAS_SPLITUV_SSE2
 #define HAS_UYVYTOUV422ROW_SSE2
 #define HAS_UYVYTOUVROW_SSE2
@@ -76,13 +77,13 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 
 // Effects
-#define HAS_ARGBMIRRORROW_SSSE3
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBBLENDROW_SSSE3
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBINTERPOLATEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
 #define HAS_ARGBQUANTIZEROW_SSE2
 #define HAS_ARGBSEPIAROW_SSSE3
 #define HAS_ARGBSHADE_SSE2
@@ -93,9 +94,9 @@ extern "C" {
 
 // The following are Windows only:
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+#define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_I422TORGBAROW_SSSE3
-#define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_RGBATOARGBROW_SSSE3
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
@@ -105,36 +106,42 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_MIRRORROW_SSE2
 #define HAS_ARGBATTENUATE_SSE2
 #define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
 #endif
 
 // The following are available on Neon platforms
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_MIRRORROW_NEON
-#define HAS_MIRRORROWUV_NEON
-#define HAS_SPLITUV_NEON
 #define HAS_COPYROW_NEON
+#define HAS_I422TOABGRROW_NEON
 #define HAS_I422TOARGBROW_NEON
 #define HAS_I422TOBGRAROW_NEON
-#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
 #define HAS_I422TORGBAROW_NEON
-#define HAS_YUY2TOUV422ROW_NEON
-#define HAS_YUY2TOUVROW_NEON
-#define HAS_YUY2TOYROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORROWUV_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUV_NEON
 #define HAS_UYVYTOUV422ROW_NEON
 #define HAS_UYVYTOUVROW_NEON
 #define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
 
 // TODO(fbarchard): Hook these up to calling functions.
-#define HAS_ARGBTORGBAROW_NEON
-#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORAWROW_NEON
 #define HAS_ABGRTOARGBROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGBAROW_NEON
 #define HAS_BGRATOARGBROW_NEON
-#define HAS_RGBATOARGBROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV21TOARGBROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGBATOARGBROW_NEON
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER)
@@ -189,6 +196,24 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width);
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width);
+void I422ToRAWRow_NEON(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width);
+void NV12ToARGBRow_NEON(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width);
+void NV21ToARGBRow_NEON(const uint8* y_buf,
+                        const uint8* uv_buf,
+                        uint8* rgb_buf,
+                        int width);
 
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -237,6 +262,15 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
 
+void SetRow8_X86(uint8* dst, uint32 v32, int count);
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height);
+void SetRow8_NEON(uint8* dst, uint32 v32, int count);
+void SetRows32_NEON(uint8* dst, uint32 v32, int width,
+                    int dst_stride, int height);
+void SetRow8_C(uint8* dst, uint32 v32, int count);
+void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);
+
 void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
@@ -341,6 +375,16 @@ void I422ToRGBARow_C(const uint8* y_buf,
                      const uint8* v_buf,
                      uint8* rgba_buf,
                      int width);
+void I422ToRGB24Row_C(const uint8* y_buf,
+                      const uint8* u_buf,
+                      const uint8* v_buf,
+                      uint8* rgb24_buf,
+                      int width);
+void I422ToRAWRow_C(const uint8* y_buf,
+                    const uint8* u_buf,
+                    const uint8* v_buf,
+                    uint8* raw_buf,
+                    int width);
 
 void YToARGBRow_C(const uint8* y_buf,
                   uint8* rgb_buf,
@@ -517,30 +561,44 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 void I422ToARGBRow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
 void I422ToBGRARow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
 void I422ToABGRRow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
 void I422ToRGBARow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width);
+void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
+                             const uint8* u_buf,
+                             const uint8* v_buf,
+                             uint8* rgb_buf,
+                             int width);
+void I422ToRAWRow_Any_NEON(const uint8* y_buf,
+                           const uint8* u_buf,
+                           const uint8* v_buf,
+                           uint8* rgb_buf,
+                           int width);
+void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
+                            const uint8* uv_buf,
+                            uint8* argb_buf,
+                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
+                            const uint8* uv_buf,
+                            uint8* argb_buf,
+                            int width);
 
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
@@ -671,4 +729,3 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 
 #endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT


@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 395
+#define LIBYUV_VERSION 396
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT


@@ -64,6 +64,7 @@
       # sources.
       'source/compare.cc',
+      'source/compare_neon.cc',
       'source/convert.cc',
      'source/convert_argb.cc',
       'source/convert_from.cc',
@@ -79,6 +80,7 @@
       'source/row_posix.cc',
       'source/row_win.cc',
       'source/scale.cc',
+      'source/scale_neon.cc',
       'source/scale_argb.cc',
       'source/video_common.cc',
     ],


@@ -244,44 +244,10 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
   return seed;
 }
 
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SUMSQUAREERROR_NEON
-static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
-                                  int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "vmov.u8    q7, #0                         \n"
-    "vmov.u8    q9, #0                         \n"
-    "vmov.u8    q8, #0                         \n"
-    "vmov.u8    q10, #0                        \n"
-  "1:                                          \n"
-    "vld1.u8    {q0}, [%0]!                    \n"
-    "vld1.u8    {q1}, [%1]!                    \n"
-    "vsubl.u8   q2, d0, d2                     \n"
-    "vsubl.u8   q3, d1, d3                     \n"
-    "vmlal.s16  q7, d4, d4                     \n"
-    "vmlal.s16  q8, d6, d6                     \n"
-    "vmlal.s16  q8, d5, d5                     \n"
-    "vmlal.s16  q10, d7, d7                    \n"
-    "subs       %2, %2, #16                    \n"
-    "bgt        1b                             \n"
-    "vadd.u32   q7, q7, q8                     \n"
-    "vadd.u32   q9, q9, q10                    \n"
-    "vadd.u32   q10, q7, q9                    \n"
-    "vpaddl.u32 q1, q10                        \n"
-    "vadd.u64   d0, d2, d3                     \n"
-    "vmov.32    %3, d0[0]                      \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
-  return sse;
-}
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 
 #elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SUMSQUAREERROR_SSE2

source/compare_neon.cc — new file (62 lines)

@@ -0,0 +1,62 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+  volatile uint32 sse;
+  asm volatile (
+    "vmov.u8    q8, #0                         \n"
+    "vmov.u8    q10, #0                        \n"
+    "vmov.u8    q9, #0                         \n"
+    "vmov.u8    q11, #0                        \n"
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.u8    {q0}, [%0]!                    \n"
+    "vld1.u8    {q1}, [%1]!                    \n"
+    "subs       %2, %2, #16                    \n"
+    "vsubl.u8   q2, d0, d2                     \n"
+    "vsubl.u8   q3, d1, d3                     \n"
+    "vmlal.s16  q8, d4, d4                     \n"
+    "vmlal.s16  q9, d6, d6                     \n"
+    "vmlal.s16  q10, d5, d5                    \n"
+    "vmlal.s16  q11, d7, d7                    \n"
+    "bgt        1b                             \n"
+    "vadd.u32   q8, q8, q9                     \n"
+    "vadd.u32   q10, q10, q11                  \n"
+    "vadd.u32   q11, q8, q10                   \n"
+    "vpaddl.u32 q1, q11                        \n"
+    "vadd.u64   d0, d2, d3                     \n"
+    "vmov.32    %3, d0[0]                      \n"
+    : "+r"(src_a),
+      "+r"(src_b),
+      "+r"(count),
+      "=r"(sse)
+    :
+    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+  return sse;
+}
+
+#endif  // __ARM_NEON__
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
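Note: SumSquareError is the building block for libyuv's quality metrics. As a reference-only sketch (not part of this commit) of how a sum-of-squared-errors is typically turned into MSE and PSNR — using the public ComputeSumSquareError, which returns a uint64 from libyuv/basic_types.h:

  #include <math.h>
  #include "libyuv/compare.h"

  // Mean squared error over 'count' bytes.
  double MseFromSse(uint64 sse, uint64 count) {
    return static_cast<double>(sse) / static_cast<double>(count);
  }

  // PSNR in dB for 8-bit samples (peak 255). Identical buffers (sse == 0)
  // return a large sentinel instead of infinity.
  double PsnrFromSse(uint64 sse, uint64 count) {
    if (sse == 0) {
      return 128.0;  // sentinel for "infinite" PSNR
    }
    return 10.0 * log10(255.0 * 255.0 / MseFromSse(sse, count));
  }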


@@ -62,6 +62,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
+// Move to row_win etc.
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_HALFROW_SSE2
 __declspec(naked) __declspec(align(16))
@@ -188,7 +189,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
 
 // Blends 32x2 pixels to 16x1
 // source in scale.cc
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
+#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SCALEROWDOWN2_NEON
 void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width);


@@ -556,6 +556,14 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_NEON;
+    }
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     NV12ToARGBRow(src_y, src_uv, dst_argb, width);
@@ -571,10 +579,10 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
 // Convert NV21 to ARGB.
 LIBYUV_API
 int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
+               const uint8* src_uv, int src_stride_uv,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
-  if (!src_y || !src_vu || !dst_argb ||
+  if (!src_y || !src_uv || !dst_argb ||
       width <= 0 || height == 0) {
     return -1;
   }
@@ -585,7 +593,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
     dst_stride_argb = -dst_stride_argb;
   }
   void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* vu_buf,
+                        const uint8* uv_buf,
                         uint8* rgb_buf,
                         int width) = NV21ToARGBRow_C;
 #if defined(HAS_NV21TOARGBROW_SSSE3)
@@ -599,13 +607,21 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_NEON;
+    }
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_vu, dst_argb, width);
+    NV21ToARGBRow(src_y, src_uv, dst_argb, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
-      src_vu += src_stride_vu;
+      src_uv += src_stride_uv;
     }
   }
   return 0;


@@ -895,68 +895,50 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
 }
 
 // Convert I420 to RGB24.
+// TODO(fbarchard): One step I420ToRGB24Row_NEON.
 LIBYUV_API
 int I420ToRGB24(const uint8* src_y, int src_stride_y,
                 const uint8* src_u, int src_stride_u,
                 const uint8* src_v, int src_stride_v,
-                uint8* dst_argb, int dst_stride_argb,
+                uint8* dst_rgb24, int dst_stride_rgb24,
                 int width, int height) {
   if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
+      !dst_rgb24 ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
-  }
-#endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRGB24Row_C;
-#if defined(HAS_ARGBTORGB24ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
-    }
-    if (IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTORGB24ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+  void (*I422ToRGB24Row)(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) = I422ToRGB24Row_C;
+#if defined(HAS_I422TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_NEON;
+    }
+  }
+#elif defined(HAS_I422TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+        I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+      }
     }
   }
 #endif
 
   for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToRGB24Row(row, dst_argb, width);
-    dst_argb += dst_stride_argb;
+    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+    dst_rgb24 += dst_stride_rgb24;
     src_y += src_stride_y;
     if (y & 1) {
       src_u += src_stride_u;
@@ -967,67 +949,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
 
 // Convert I420 to RAW.
+// TODO(fbarchard): One step I420ToRAWRow_NEON.
 LIBYUV_API
 int I420ToRAW(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
-              uint8* dst_argb, int dst_stride_argb,
+              uint8* dst_raw, int dst_stride_raw,
               int width, int height) {
   if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
+      !dst_raw ||
       width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+    dst_stride_raw = -dst_stride_raw;
   }
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
-#if defined(HAS_I422TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I422ToARGBRow = I422ToARGBRow_NEON;
-  }
-#elif defined(HAS_I422TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    I422ToARGBRow = I422ToARGBRow_SSSE3;
-  }
-#endif
-
-  SIMD_ALIGNED(uint8 row[kMaxStride]);
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
-      ARGBToRAWRow_C;
-#if defined(HAS_ARGBTORAWROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
-    }
-    if (IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBToRAWRow = ARGBToRAWRow_SSSE3;
-    }
-  }
-#elif defined(HAS_ARGBTORAWROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width * 3 <= kMaxStride) {
-      ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
-    }
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToRAWRow = ARGBToRAWRow_NEON;
+  void (*I422ToRAWRow)(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width) = I422ToRAWRow_C;
+#if defined(HAS_I422TORAWROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRAWRow = I422ToRAWRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRAWRow = I422ToRAWRow_NEON;
+    }
+  }
+#elif defined(HAS_I422TORAWROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+        I422ToRAWRow = I422ToRAWRow_SSSE3;
+      }
     }
   }
 #endif
 
   for (int y = 0; y < height; ++y) {
-    I422ToARGBRow(src_y, src_u, src_v, row, width);
-    ARGBToRAWRow(row, dst_argb, width);
-    dst_argb += dst_stride_argb;
+    I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+    dst_raw += dst_stride_raw;
     src_y += src_stride_y;
     if (y & 1) {
      src_u += src_stride_u;
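Note: in libyuv, RAW is simply RGB24 with the R and B bytes swapped — the new I422ToRAWRow_C above writes (r, g, b) where the RGB24 row writes (b, g, r), and the NEON row later in this commit differs only by a vswp of d20/d22. A one-function sketch of that relationship (illustration only, not code from this commit):

  // Convert one row of RGB24 (B,G,R byte order) to RAW (R,G,B byte order)
  // by swapping the first and third byte of each 3-byte pixel.
  void RGB24ToRAWRow_Sketch(const uint8* src, uint8* dst, int width) {
    for (int x = 0; x < width; ++x) {
      dst[0] = src[2];
      dst[1] = src[1];
      dst[2] = src[0];
      src += 3;
      dst += 3;
    }
  }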


@@ -29,7 +29,7 @@
 // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
 #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
 static __inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (
+  asm volatile (  // NOLINT
    "mov %%ebx, %%edi                          \n"
    "cpuid                                     \n"
    "xchg %%edi, %%ebx                         \n"
@@ -38,7 +38,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) {
 }
 #elif defined(__i386__) || defined(__x86_64__)
 static __inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (
+  asm volatile (  // NOLINT
    "cpuid                                     \n"
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type));
@@ -85,7 +85,7 @@ static uint32 XGetBV(unsigned int xcr) {
 #define HAS_XGETBV
 static uint32 XGetBV(unsigned int xcr) {
   uint32 xcr_feature_mask;
-  asm volatile (
+  asm volatile (  // NOLINT
    ".byte 0x0f, 0x01, 0xd0\n"
    : "=a"(xcr_feature_mask)
    : "c"(xcr)
@@ -124,6 +124,18 @@ int ArmCpuCaps(const char* cpuinfo_name) {
 LIBYUV_API
 int cpu_info_ = 0;
 
+// Test environment variable for disabling CPU features. Any non-zero value
+// to disable. Zero ignored to make it easy to set the variable on/off.
+static bool TestEnv(const char* name) {
+  const char* var = getenv(name);
+  if (var) {
+    if (var[0] != '0') {
+      return true;
+    }
+  }
+  return false;
+}
+
 LIBYUV_API
 int InitCpuFlags(void) {
 #if !defined(__CLR_VER) && defined(CPU_X86)
@@ -144,34 +156,33 @@ int InitCpuFlags(void) {
     }
   }
 #endif
   // environment variable overrides for testing.
-  if (getenv("LIBYUV_DISABLE_X86")) {
+  if (TestEnv("LIBYUV_DISABLE_X86")) {
     cpu_info_ &= ~kCpuHasX86;
   }
-  if (getenv("LIBYUV_DISABLE_SSE2")) {
+  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
     cpu_info_ &= ~kCpuHasSSE2;
   }
-  if (getenv("LIBYUV_DISABLE_SSSE3")) {
+  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
     cpu_info_ &= ~kCpuHasSSSE3;
   }
-  if (getenv("LIBYUV_DISABLE_SSE41")) {
+  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
     cpu_info_ &= ~kCpuHasSSE41;
   }
-  if (getenv("LIBYUV_DISABLE_SSE42")) {
+  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
     cpu_info_ &= ~kCpuHasSSE42;
   }
-  if (getenv("LIBYUV_DISABLE_AVX")) {
+  if (TestEnv("LIBYUV_DISABLE_AVX")) {
     cpu_info_ &= ~kCpuHasAVX;
   }
-  if (getenv("LIBYUV_DISABLE_AVX2")) {
+  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
     cpu_info_ &= ~kCpuHasAVX2;
   }
-  if (getenv("LIBYUV_DISABLE_ASM")) {
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
     cpu_info_ = kCpuInitialized;
   }
 #elif defined(__arm__)
-#if defined(__linux__) && defined(__ARM_NEON__)
+#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
   // linux arm parse text file for neon detect.
   cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
 #elif defined(__ARM_NEON__)
@@ -181,10 +192,10 @@ int InitCpuFlags(void) {
   cpu_info_ = kCpuHasNEON;
 #endif
   cpu_info_ |= kCpuInitialized | kCpuHasARM;
-  if (getenv("LIBYUV_DISABLE_NEON")) {
+  if (TestEnv("LIBYUV_DISABLE_NEON")) {
     cpu_info_ &= ~kCpuHasNEON;
   }
-  if (getenv("LIBYUV_DISABLE_ASM")) {
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
     cpu_info_ = kCpuInitialized;
   }
 #endif  // __arm__
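With TestEnv, a feature is disabled only when the variable's first character is not '0', so LIBYUV_DISABLE_NEON=0 now leaves NEON enabled (previously any set value, including "0", disabled it). A small usage sketch — hypothetical test code, not part of this commit:

  #include <stdlib.h>
  #include "libyuv/cpu_id.h"

  // Force the NEON path off before libyuv reads its CPU flags,
  // e.g. for an A/B benchmark of the C fallback.
  int main() {
    setenv("LIBYUV_DISABLE_NEON", "1", 1);  // any value not starting with '0'
    libyuv::InitCpuFlags();  // applies the environment overrides shown above
    // ... run benchmark; TestCpuFlag(kCpuHasNEON) now reports false ...
    return 0;
  }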


@@ -10,6 +10,7 @@
 
 #include "libyuv/mjpeg_decoder.h"
 
+#ifdef HAVE_JPEG
 // Must be included before jpeglib
 #include <assert.h>
 #ifndef __CLR_VER
@@ -578,3 +579,5 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
 }
 
 }  // namespace libyuv
+
+#endif  // HAVE_JPEG


@@ -105,6 +105,130 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
   }
 }
 
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+  void (*YUY2ToYRow)(const uint8* src_yuy2,
+                     uint8* dst_y, int pix);
+  YUY2ToYRow = YUY2ToYRow_C;
+  YUY2ToUV422Row = YUY2ToUV422Row_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (width > 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          YUY2ToYRow = YUY2ToYRow_SSE2;
+        }
+      }
+    }
+  }
+#elif defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      YUY2ToYRow = YUY2ToYRow_Any_NEON;
+      if (width > 16) {
+        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  void (*UYVYToUV422Row)(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+                     uint8* dst_y, int pix);
+  UYVYToYRow = UYVYToYRow_C;
+  UYVYToUV422Row = UYVYToUV422Row_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (width > 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+      UYVYToYRow = UYVYToYRow_Any_SSE2;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+        UYVYToUV422Row = UYVYToUV422Row_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          UYVYToYRow = UYVYToYRow_SSE2;
+        }
+      }
+    }
+  }
+#elif defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      UYVYToYRow = UYVYToYRow_Any_NEON;
+      if (width > 16) {
+        UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUV422Row = UYVYToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (int y = 0; y < height; ++y) {
+    UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    src_uyvy += src_stride_uyvy;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
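A minimal sketch of calling the new YUY2ToI422 entry point; buffer layout is the caller's responsibility and the sizes below are the usual I422 assumptions (full-height U/V planes at half width), not something this diff specifies:

  #include "libyuv/planar_functions.h"

  // Unpack a width x height YUY2 frame into I422 planes.
  // Assumes the caller allocated y[width*height] and u, v[(width/2)*height].
  int UnpackYuy2(const uint8* yuy2, int width, int height,
                 uint8* y, uint8* u, uint8* v) {
    return libyuv::YUY2ToI422(yuy2, width * 2,  // 2 bytes per packed pixel
                              y, width,
                              u, width / 2,
                              v, width / 2,
                              width, height);
  }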
 // Mirror I420 with optional flipping
 LIBYUV_API
 int I420Mirror(const uint8* src_y, int src_stride_y,
@@ -721,6 +845,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
     NV12ToARGBRow = NV12ToARGBRow_SSSE3;
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
+    NV12ToARGBRow = NV12ToARGBRow_NEON;
+  }
+#endif
 
   SIMD_ALIGNED(uint8 row[kMaxStride]);
   void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
@@ -789,129 +918,6 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-// SetRow8 writes 'count' bytes using a 32 bit value repeated
-// SetRow32 writes 'count' words using a 32 bit value repeated
-#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
-#define HAS_SETROW_NEON
-static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (  // NOLINT
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-    "1:                                        \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    "vst1.u32  {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "q0", "memory", "cc");
-}
-
-// TODO(fbarchard): Make fully assembler
-static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
-                           int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    SetRow8_NEON(dst, v32, width << 2);
-    dst += dst_stride;
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
-#define HAS_SETROW_X86
-__declspec(naked) __declspec(align(16))
-static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
-  __asm {
-    mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
-    mov        ecx, [esp + 12]  // count
-    shr        ecx, 2
-    rep stosd
-    mov        edi, edx
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-static void SetRows32_X86(uint8* dst, uint32 v32, int width,
-                          int dst_stride, int height) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebp
-    mov        edi, [esp + 12 + 4]   // dst
-    mov        eax, [esp + 12 + 8]   // v32
-    mov        ebp, [esp + 12 + 12]  // width
-    mov        edx, [esp + 12 + 16]  // dst_stride
-    mov        esi, [esp + 12 + 20]  // height
-    lea        ecx, [ebp * 4]
-    sub        edx, ecx              // stride - width * 4
-
-    align      16
-  convertloop:
-    mov        ecx, ebp
-    rep stosd
-    add        edi, edx
-    sub        esi, 1
-    jg         convertloop
-
-    pop        ebp
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
-#define HAS_SETROW_X86
-static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
-  size_t width_tmp = static_cast<size_t>(width);
-  asm volatile (  // NOLINT
-    "shr       $0x2,%1                         \n"
-    "rep stosl                                 \n"
-  : "+D"(dst),       // %0
-    "+c"(width_tmp)  // %1
-  : "a"(v32)         // %2
-  : "memory", "cc");
-}
-
-static void SetRows32_X86(uint8* dst, uint32 v32, int width,
-                          int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    size_t width_tmp = static_cast<size_t>(width);
-    uint32* d = reinterpret_cast<uint32*>(dst);
-    asm volatile (  // NOLINT
-      "rep stosl                               \n"
-    : "+D"(d),         // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
-    dst += dst_stride;
-  }
-}
-#endif
-
-static void SetRow8_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
-  for (int x = 0; x < count; ++x) {
-    dst[x] = v8;
-  }
-#else
-  memset(dst, v8, count);
-#endif
-}
-
-static void SetRows32_C(uint8* dst, uint32 v32, int width,
-                        int dst_stride, int height) {
-  for (int y = 0; y < height; ++y) {
-    uint32* d = reinterpret_cast<uint32*>(dst);
-    for (int x = 0; x < width; ++x) {
-      d[x] = v32;
-    }
-    dst += dst_stride;
-  }
-}
-
 LIBYUV_API
 void SetPlane(uint8* dst_y, int dst_stride_y,
               int width, int height,
@@ -929,13 +935,6 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
     SetRow = SetRow8_X86;
   }
 #endif
-#if defined(HAS_SETROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(width, 16) &&
-      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-    SetRow = SetRow8_SSE2;
-  }
-#endif
 
   uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
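For context, SetPlane (whose public signature appears in planar_functions.h above) fills a plane with one byte value via these Set rows. A small usage sketch — the 0x10/0x80 values are the conventional video-range black, chosen here as an example, not taken from this commit:

  #include "libyuv/planar_functions.h"

  // Clear an I420 frame to black: Y = 0x10, U = V = 0x80.
  void ClearToBlack(uint8* y, int y_stride, uint8* u, int u_stride,
                    uint8* v, int v_stride, int width, int height) {
    libyuv::SetPlane(y, y_stride, width, height, 0x10);
    libyuv::SetPlane(u, u_stride, width / 2, height / 2, 0x80);
    libyuv::SetPlane(v, v_stride, width / 2, height / 2, 0x80);
  }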
// Set plane // Set plane


@@ -459,6 +459,14 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
                               (255u << ashift);
 }
 
+static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
+                               uint8* b, uint8* g, uint8* r) {
+  int32 y1 = (static_cast<int32>(y) - 16) * YG;
+  *b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
+  *g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
+  *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+}
+
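YuvPixel2 is the scalar core of the two new RGB24/RAW rows: 6-bit fixed point, so each coefficient is a BT.601 value pre-scaled by 64 and the final >> 6 divides that back out (the YG/UB/.../BB constants are defined earlier in row_common.cc and are not shown in this hunk). A float sketch of the same math, for reference only and not part of the commit:

  // Reference-only sketch of the conversion YuvPixel2 implements in
  // fixed point, using standard BT.601 video-range coefficients.
  static inline uint8 ClampToByte(float v) {
    return v < 0.f ? 0 : (v > 255.f ? 255 : static_cast<uint8>(v + 0.5f));
  }

  static void YuvPixelFloat(uint8 y, uint8 u, uint8 v,
                            uint8* b, uint8* g, uint8* r) {
    float y1 = 1.164f * (y - 16);  // matches (y - 16) * YG with YG ~= 74/64
    *b = ClampToByte(y1 + 2.018f * (u - 128));
    *g = ClampToByte(y1 - 0.391f * (u - 128) - 0.813f * (v - 128));
    *r = ClampToByte(y1 + 1.596f * (v - 128));
  }

The 128 subtractions correspond to the BB/BG/BR bias terms in the fixed-point version.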
 void I444ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
@@ -492,6 +500,48 @@ void I422ToARGBRow_C(const uint8* y_buf,
   }
 }
 
+void I422ToRGB24Row_C(const uint8* y_buf,
+                      const uint8* u_buf,
+                      const uint8* v_buf,
+                      uint8* rgb_buf,
+                      int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+              rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+    y_buf += 2;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  }
+}
+
+void I422ToRAWRow_C(const uint8* y_buf,
+                    const uint8* u_buf,
+                    const uint8* v_buf,
+                    uint8* rgb_buf,
+                    int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+    YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
+              rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+    y_buf += 2;
+    u_buf += 1;
+    v_buf += 1;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
+              rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  }
+}
+
 void I411ToARGBRow_C(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
@@ -671,6 +721,28 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }
 
+void SetRow8_C(uint8* dst, uint32 v8, int count) {
+#ifdef _MSC_VER
+  // VC will generate rep stosb.
+  for (int x = 0; x < count; ++x) {
+    dst[x] = v8;
+  }
+#else
+  memset(dst, v8, count);
+#endif
+}
+
+void SetRows32_C(uint8* dst, uint32 v32, int width,
+                 int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    uint32* d = reinterpret_cast<uint32*>(dst);
+    for (int x = 0; x < width; ++x) {
+      d[x] = v32;
+    }
+    dst += dst_stride;
+  }
+}
+
 // Filter 2 rows of YUY2 UV's (422) into U and V (420).
 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
                    uint8* dst_u, uint8* dst_v, int width) {
@@ -950,6 +1022,11 @@ Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
 YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
 #endif
+#ifdef HAS_I422TORGB24ROW_SSSE3
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
+     I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
+#endif
 #ifdef HAS_I422TORGBAROW_SSSE3
 YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
 #endif
@@ -958,6 +1035,10 @@ YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
 YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
 YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
+Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
+Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
+YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
+YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
 #endif
 #undef YANY


@@ -101,8 +101,8 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
     : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
-      "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TOARGBROW_NEON
@@ -135,8 +135,8 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TOBGRAROW_NEON
@@ -169,8 +169,8 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TOABGRROW_NEON
@@ -202,12 +202,77 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
       "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q10", "q11", "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_I422TORGBAROW_NEON
 
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* rgb_buf,
+                         int width) {
+  asm volatile (
+    "vld1.u8    {d24}, [%5]                    \n"
+    "vld1.u8    {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(y_buf),    // %0
+      "+r"(u_buf),    // %1
+      "+r"(v_buf),    // %2
+      "+r"(rgb_buf),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* y_buf,
+                       const uint8* u_buf,
+                       const uint8* v_buf,
+                       uint8* rgb_buf,
+                       int width) {
+  asm volatile (
+    "vld1.u8    {d24}, [%5]                    \n"
+    "vld1.u8    {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(y_buf),    // %0
+      "+r"(u_buf),    // %1
+      "+r"(v_buf),    // %2
+      "+r"(rgb_buf),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_I422TORAWROW_NEON
+
 #ifdef HAS_NV12TOARGBROW_NEON
 void NV12ToARGBRow_NEON(const uint8* y_buf,
                         const uint8* uv_buf,
@@ -233,8 +298,8 @@ void NV12ToARGBRow_NEON(const uint8* y_buf,
       "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
-      "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_NV12TOARGBROW_NEON
@@ -264,8 +329,8 @@ void NV21ToARGBRow_NEON(const uint8* y_buf,
       "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
-      "q12", "q13", "q14", "q15"
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
 #endif  // HAS_NV21TOARGBROW_NEON
@@ -312,7 +377,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
 #endif  // HAS_COPYROW_NEON
 
 #ifdef HAS_SETROW_NEON
-// SetRow8 writes 'count' bytes using a 32 bit value repeated
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
 void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (  // NOLINT
    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
@@ -327,7 +392,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
 }
 
 // TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated
+// SetRow32 writes 'count' words using a 32 bit value repeated.
 void SetRows32_NEON(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
   for (int y = 0; y < height; ++y) {
@@ -358,7 +423,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
    "sub        %1, #16                        \n"
     // the loop needs to run on blocks of 16. what will be left
     // over is either a negative number, the residuals that need
-    // to be done, or 0. if this isn't subtracted off here the
+    // to be done, or 0. If this isn't subtracted off here the
     // loop will run one extra time.
    "sub        %2, #16                        \n"


@@ -2143,6 +2143,34 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_COPYROW_X86
 
+#ifdef HAS_SETROW_X86
+void SetRow8_X86(uint8* dst, uint32 v32, int width) {
+  size_t width_tmp = static_cast<size_t>(width);
+  asm volatile (
+    "shr       $0x2,%1                         \n"
+    "rep stosl                                 \n"
+  : "+D"(dst),       // %0
+    "+c"(width_tmp)  // %1
+  : "a"(v32)         // %2
+  : "memory", "cc");
+}
+
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    size_t width_tmp = static_cast<size_t>(width);
+    uint32* d = reinterpret_cast<uint32*>(dst);
+    asm volatile (
+      "rep stosl                               \n"
+    : "+D"(d),         // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+    dst += dst_stride;
+  }
+}
+#endif  // HAS_SETROW_X86
+
 #ifdef HAS_YUY2TOYROW_SSE2
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
   asm volatile (


@@ -18,6 +18,7 @@ extern "C" {
 
 // This module is for Visual C x86.
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
+// TODO(fbarchard): I420ToRGB24, I420ToRAW
 
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
@@ -2521,6 +2522,54 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
 }
 #endif  // HAS_COPYROW_X86
 
+#ifdef HAS_SETROW_X86
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow8_X86(uint8* dst, uint32 v32, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v32
+    mov        ecx, [esp + 12]  // count
+    shr        ecx, 2
+    rep stosd
+    mov        edi, edx
+    ret
+  }
+}
+
+// SetRow32 writes 'count' words using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRows32_X86(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebp
+    mov        edi, [esp + 12 + 4]   // dst
+    mov        eax, [esp + 12 + 8]   // v32
+    mov        ebp, [esp + 12 + 12]  // width
+    mov        edx, [esp + 12 + 16]  // dst_stride
+    mov        esi, [esp + 12 + 20]  // height
+    lea        ecx, [ebp * 4]
+    sub        edx, ecx              // stride - width * 4
+
+    align      16
+  convertloop:
+    mov        ecx, ebp
+    rep stosd
+    add        edi, edx
+    sub        esi, 1
+    jg         convertloop
+
+    pop        ebp
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SETROW_X86
+
 #ifdef HAS_YUY2TOYROW_SSE2
 __declspec(naked) __declspec(align(16))
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,


@ -54,514 +54,49 @@ void SetUseReferenceImpl(bool use) {
#define HAS_SCALEROWDOWN2_NEON #define HAS_SCALEROWDOWN2_NEON
// Note - not static due to reuse in convert for 444 to 420. // Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) { uint8* dst, int dst_width);
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) { uint8* dst, int dst_width);
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
// row 2 add adjacent, add row 1 to row 2
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
#define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEROWDOWN4_NEON
static void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile ( void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"1: \n" uint8* dst_ptr, int dst_width);
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
static void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the neon multilane read/write // Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load up the every 4th pixel into a 4 different registers. // to load up the every 4th pixel into a 4 different registers.
// Point samples 32 pixels to 24 pixels. // Point samples 32 pixels to 24 pixels.
static void ScaleRowDown34_NEON(const uint8* src_ptr, void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile ( void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile ( void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN38_NEON #define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// 32 -> 12 // 32 -> 12
static void ScaleRowDown38_NEON(const uint8* src_ptr, void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
// 32x3 -> 12x1 // 32x3 -> 12x1
static void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) { uint8* dst_ptr, int dst_width);
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
// 32x2 -> 12x1
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
// 16x2 -> 16x1
#define HAS_SCALEFILTERROWS_NEON
static void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
/**
* SSE2 downscalers with interpolation.

source/scale_neon.cc (new file)
@@ -0,0 +1,534 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
/**
* NEON downscalers with interpolation.
*
* Provided by Fritz Koenig
*
*/
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
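// For reference, a scalar sketch of the kernel above: vld2 deinterleaves
// the row and vst1 stores q0, so the even pixel of each pair survives.
// Illustrative only; the _C_Sketch name is hypothetical and uint8_t
// (from <stdint.h>) stands in for libyuv's uint8.
static void ScaleRowDown2_C_Sketch(const uint8_t* src_ptr,
                                   uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // keep the even pixel of each pair
  }
}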
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
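// Scalar sketch of the 2x2 box filter above (hypothetical name; assumes
// <stdint.h> and <stddef.h>): vpaddl/vpadal sum each 2x2 block and
// vrshrn #2 divides by 4 with rounding.
static void ScaleRowDown2Int_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;               // row 1
  const uint8_t* t = src_ptr + src_stride;  // row 2
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}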
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
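// Tracing the lanes above (vld2, vtrn.u8, vshrn #8, then the 32-bit
// store of lane 1), the bytes that survive each 16-byte load work out
// to src[0], src[4], src[8], src[12]. A scalar sketch of that
// selection (hypothetical name, <stdint.h> types):
static void ScaleRowDown4_C_Sketch(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4];  // point sample every 4th pixel
  }
}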
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
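// Scalar sketch of the 4x4 box filter above (hypothetical name): each
// output pixel is the rounded average of a 4x4 block, matching the
// vpaddl/vpadal accumulation and the vrshrn #4 divide by 16.
static void ScaleRowDown4Int_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int r = 0; r < 4; ++r) {
      const uint8_t* s = src_ptr + r * src_stride + x * 4;
      sum += s[0] + s[1] + s[2] + s[3];
    }
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  // /16 with rounding
  }
}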
// Downscale from 4 to 3 pixels. Use the NEON multi-lane read/write
// to load every 4th pixel into a different register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
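// The vmov above overwrites lane d2 with d3, so of every 4 input pixels
// the kernel keeps pixels 0, 1 and 3. Scalar sketch (hypothetical name):
static void ScaleRowDown34_C_Sketch(const uint8_t* src_ptr,
                                    uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}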
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
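// Scalar sketch of the filtered 4-to-3 kernel above (hypothetical name):
// the two rows are first blended 3:1, then each group of 4 pixels is
// narrowed to 3 with the a0/a1/a2 weights from the comments. Rounding
// matches vqrshrn; saturation never triggers for 8-bit inputs.
static void ScaleRowDown34_0_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    int p0 = (3 * s[0] + t[0] + 2) >> 2;
    int p1 = (3 * s[1] + t[1] + 2) >> 2;
    int p2 = (3 * s[2] + t[2] + 2) >> 2;
    int p3 = (3 * s[3] + t[3] + 2) >> 2;
    dst_ptr[x + 0] = (uint8_t)((3 * p0 + p1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((p1 + p2 + 1) >> 1);
    dst_ptr[x + 2] = (uint8_t)((p2 + 3 * p3 + 2) >> 2);
    s += 4;
    t += 4;
  }
}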
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
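// These constants implement division in fixed point: vqrdmulh.s16
// computes roughly (2 * a * b + (1 << 15)) >> 16, so b = 65536 / 12
// yields a / 6 and b = 65536 / 18 yields a / 9. A scalar sketch of the
// same trick (hypothetical name):
static inline uint8_t FixedDiv6_Sketch(int sum) {
  // e.g. sum = 1530 (six pixels of 255) gives 255
  return (uint8_t)((2 * sum * (65536 / 12) + (1 << 15)) >> 16);
}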
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
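// The two vtbl lookups above are a pure point sample: of every 32 input
// pixels, the 12 at the offsets baked into kShuf38 survive. Scalar
// sketch (hypothetical name):
static void ScaleRowDown38_C_Sketch(const uint8_t* src_ptr,
                                    uint8_t* dst_ptr, int dst_width) {
  static const int kIdx[12] = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30};
  for (int x = 0; x < dst_width; x += 12) {
    for (int i = 0; i < 12; ++i) {
      dst_ptr[x + i] = src_ptr[kIdx[i]];
    }
    src_ptr += 32;
  }
}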
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
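// Scalar sketch of the 3-row kernel above (hypothetical name): each 8x3
// block yields 3 pixels, two averaging 3x3 = 9 samples and one averaging
// 2x3 = 6. Plain integer division here; the vqrdmulh path rounds, so
// results can differ by one.
static void ScaleRowDown38_3_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* r0 = src_ptr;
  const uint8_t* r1 = src_ptr + src_stride;
  const uint8_t* r2 = src_ptr + src_stride * 2;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((r0[0] + r0[1] + r0[2] + r1[0] + r1[1] +
                                r1[2] + r2[0] + r2[1] + r2[2]) / 9);
    dst_ptr[x + 1] = (uint8_t)((r0[3] + r0[4] + r0[5] + r1[3] + r1[4] +
                                r1[5] + r2[3] + r2[4] + r2[5]) / 9);
    dst_ptr[x + 2] = (uint8_t)((r0[6] + r0[7] + r1[6] + r1[7] +
                                r2[6] + r2[7]) / 6);
    r0 += 8;
    r1 += 8;
    r2 += 8;
  }
}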
// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
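// Scalar sketch of the 2-row kernel above (hypothetical name): each 8x2
// block yields 3 pixels, two averaging 3x2 = 6 samples (the fixed-point
// divide by 6) and one averaging 2x2 = 4 (vqrshrn #2, rounded).
static void ScaleRowDown38_2_C_Sketch(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((s[0] + s[1] + s[2] +
                                t[0] + t[1] + t[2]) / 6);
    dst_ptr[x + 1] = (uint8_t)((s[3] + s[4] + s[5] +
                                t[3] + t[4] + t[5]) / 6);
    dst_ptr[x + 2] = (uint8_t)((s[6] + s[7] + t[6] + t[7] + 2) >> 2);
    s += 8;
    t += 8;
  }
}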
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
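// Scalar sketch of the row interpolator above (hypothetical name):
// blend two rows at source_y_fraction/256, with the special cases
// (fraction 0 is a copy, 128 a plain average) falling out of the
// general formula. The extra write past dst_width mirrors the final
// vst1.u8 {d1[7]} above.
static void ScaleFilterRows_C_Sketch(uint8_t* dst_ptr,
                                     const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     int dst_width, int source_y_fraction) {
  int y1 = source_y_fraction;  // weight of the second row, 0..256
  int y0 = 256 - y1;           // weight of the first row
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s[x] * y0 + t[x] * y1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // replicate last pixel
}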
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif