Move Neon source to its own files.

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/860009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@396 16f28f9a-4ce2-e073-06de-1de4eb20be90
fbarchard@google.com 2012-10-09 00:05:29 +00:00
parent 4807dea4e7
commit 64ce0ab544
32 changed files with 1262 additions and 868 deletions

View File

@ -22,8 +22,10 @@ LOCAL_SRC_FILES := \
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
LOCAL_CFLAGS += -DLIBYUV_NEON
LOCAL_SRC_FILES += \
source/compare_neon.cc \
source/rotate_neon.cc.neon \
source/row_neon.cc.neon \
source/scale_neon.cc
endif
LOCAL_C_INCLUDES += $(LOCAL_PATH)/include

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 395
Version: 396
License: BSD
License File: LICENSE

View File

@ -18,7 +18,7 @@ namespace libyuv {
extern "C" {
#endif
// Compute a hash for specified memory. Seed of 5381 recommended.
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
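// A reference sketch of the hash declared above: DJB2 is the Bernstein
// recurrence hash = hash * 33 + byte, which is why a seed of 5381 (the
// classic starting value) is recommended. The name below is illustrative,
// not the library's internal symbol.
static uint32 HashDjb2_Sketch(const uint8* src, uint64 count, uint32 seed) {
  uint32 hash = seed;
  for (uint64 i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}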

View File

@ -73,7 +73,7 @@ int I400ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert NV12 to I420. Also used for NV21.
LIBYUV_API
int NV12ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_uv, int src_stride_uv,
@ -229,7 +229,7 @@ int MJPGToI420(const uint8* sample, size_t sample_size,
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
// "format" is a fourcc. ie 'I420', 'YUY2'
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToI420(const uint8* src_frame, size_t src_size,

View File

@ -19,7 +19,7 @@
// TODO(fbarchard): This set of functions should exactly match convert.h
// Add missing V210 and Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and/or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.
@ -75,7 +75,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
// Convert I400 to ARGB. Reverse of ARGBToI400.
LIBYUV_API
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
@ -209,7 +209,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
// Must be less than or equal to src_width/src_height
// Cropping parameters are pre-rotation.
// "rotation" can be 0, 90, 180 or 270.
// "format" is a fourcc. ie 'I420', 'YUY2'
// "format" is a fourcc. ie 'I420', 'YUY2'
// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
LIBYUV_API
int ConvertToARGB(const uint8* src_frame, size_t src_size,

View File

@ -50,7 +50,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,

View File

@ -58,7 +58,7 @@ static __inline int TestCpuFlag(int test_flag) {
LIBYUV_API
void MaskCpuFlags(int enable_flags);
// Low level cpuid for X86. Returns zeros on other CPUs.
LIBYUV_API
void CpuId(int cpu_info[4], int info_type);
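// Usage sketch for the declarations above (values illustrative): query cpuid
// leaf 1 directly, or gate a code path on a cached feature flag.
//   int cpu_info[4];
//   CpuId(cpu_info, 1);  // leaf 1: feature bits on x86, zeros elsewhere.
//   if (TestCpuFlag(kCpuHasSSSE3)) {
//     // Select an SSSE3 row function.
//   }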

View File

@ -27,13 +27,31 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value);
// Alias.
#define I400ToI400 CopyPlane
// Copy a plane of data (I420 to I400).
LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height);
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height);
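// For orientation: YUY2 packs two pixels into four bytes as Y0 U Y1 V, and
// UYVY as U Y0 V Y1, so both conversions above only split bytes into planes.
// A scalar sketch of one YUY2 row (helper name is illustrative):
static void YUY2ToI422Row_Sketch(const uint8* src_yuy2, uint8* dst_y,
                                 uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width - 1; x += 2) {  // assumes even width
    dst_y[0] = src_yuy2[0];
    dst_u[0] = src_yuy2[1];
    dst_y[1] = src_yuy2[2];
    dst_v[0] = src_yuy2[3];
    src_yuy2 += 4;
    dst_y += 2;
    dst_u += 1;
    dst_v += 1;
  }
}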
// Convert I420 to I400. (calls CopyPlane ignoring u/v).
LIBYUV_API
int I420ToI400(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
@ -196,7 +214,7 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
// interval_offset should be a value between 0 and 255.
@ -261,7 +279,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int w, int h, int dw, int dh);
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
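// In recurrence form (a sketch of the definition, not of the SSE2 code):
//   cumsum(x, y) = value(x, y) + cumsum(x - 1, y) + cumsum(x, y - 1)
//                - cumsum(x - 1, y - 1)
// which lets any axis-aligned box sum be read back with four table lookups.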
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
@ -299,7 +317,7 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
#define YUV_DISABLE_ASM
#endif
// Row functions for copying pixels from a source with a slope to a row
// of destination. Useful for scaling, rotation, mirror, texture mapping.
LIBYUV_API
void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* uv_dudv, int width);
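// A plausible scalar form of the affine row declared above: uv_dudv holds a
// start point (u, v) followed by a per-pixel step (du, dv), and each output
// pixel copies one 32-bit ARGB sample along that line through the source
// (name is illustrative).
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = static_cast<int>(u);
    int y = static_cast<int>(v);
    *reinterpret_cast<uint32*>(dst_argb) = *reinterpret_cast<const uint32*>(
        src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}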

View File

@ -66,6 +66,7 @@ extern "C" {
#define HAS_RAWTOARGBROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#define HAS_SETROW_X86
#define HAS_SPLITUV_SSE2
#define HAS_UYVYTOUV422ROW_SSE2
#define HAS_UYVYTOUVROW_SSE2
@ -76,13 +77,13 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
// Effects
#define HAS_ARGBAFFINEROW_SSE2
#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBINTERPOLATEROW_SSSE3
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBQUANTIZEROW_SSE2
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADE_SSE2
@ -93,9 +94,9 @@ extern "C" {
// The following are Windows only:
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_ARGBCOLORTABLEROW_X86
#define HAS_I422TORGBAROW_SSSE3
#define HAS_RGBATOARGBROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
@ -105,36 +106,42 @@ extern "C" {
#if !defined(YUV_DISABLE_ASM) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY)
#define HAS_ARGBATTENUATE_SSE2
#define HAS_ARGBBLENDROW_SSE2
#define HAS_MIRRORROW_SSE2
#endif
// The following are available on Neon platforms
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_COPYROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TOARGBROW_NEON
#define HAS_I422TOBGRAROW_NEON
#define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON
#define HAS_I422TORGBAROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITUV_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
// TODO(fbarchard): Hook these up to calling functions.
#define HAS_ABGRTOARGBROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGBAROW_NEON
#define HAS_BGRATOARGBROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV21TOARGBROW_NEON
#define HAS_RAWTOARGBROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGBATOARGBROW_NEON
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER)
@ -189,6 +196,24 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGB24Row_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width);
void NV21ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width);
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@ -237,6 +262,15 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);
void SetRow8_X86(uint8* dst, uint32 v32, int count);
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
void SetRow8_NEON(uint8* dst, uint32 v32, int count);
void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
void SetRow8_C(uint8* dst, uint32 v32, int count);
void SetRows32_C(uint8* dst, uint32 v32, int width, int dst_stride, int height);
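// Illustrative call of the Set rows above: fill a 16x8 pixel rectangle of an
// ARGB surface at (20, 10) with opaque red (buffer and stride hypothetical).
//   SetRows32_C(dst_argb + 10 * dst_stride + 20 * 4, 0xFFFF0000u,
//               16, dst_stride, 8);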
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
@ -341,6 +375,16 @@ void I422ToRGBARow_C(const uint8* y_buf,
const uint8* v_buf,
uint8* rgba_buf,
int width);
void I422ToRGB24Row_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb24_buf,
int width);
void I422ToRAWRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* raw_buf,
int width);
void YToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
@ -517,30 +561,44 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void I422ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToBGRARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToABGRRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGBARow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRGB24Row_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToRAWRow_Any_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void NV12ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
@ -671,4 +729,3 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 395
#define LIBYUV_VERSION 396
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -64,6 +64,7 @@
# sources.
'source/compare.cc',
'source/compare_neon.cc',
'source/convert.cc',
'source/convert_argb.cc',
'source/convert_from.cc',
@ -79,6 +80,7 @@
'source/row_posix.cc',
'source/row_win.cc',
'source/scale.cc',
'source/scale_neon.cc',
'source/scale_argb.cc',
'source/video_common.cc',
],

View File

@ -244,44 +244,10 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SUMSQUAREERROR_NEON
static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q7, d4, d4 \n"
"vmlal.s16 q8, d6, d6 \n"
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bgt 1b \n"
"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
"vadd.u32 q10, q7, q9 \n"
"vpaddl.u32 q1, q10 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
return sse;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SUMSQUAREERROR_SSE2

source/compare_neon.cc Normal file
View File

@ -0,0 +1,62 @@
/*
* Copyright 2012 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
".p2align 2 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
#endif // __ARM_NEON__
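// For reference, a scalar equivalent of the Neon loop above (the asm widens
// the byte differences to 16 bits before squaring and accumulates in four
// 32-bit lanes; the code below is the same arithmetic, modulo order of
// summation):
static uint32 SumSquareError_Sketch(const uint8* src_a, const uint8* src_b,
                                    int count) {
  uint32 sse = 0u;
  for (int i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];
    sse += static_cast<uint32>(diff * diff);
  }
  return sse;
}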
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -62,6 +62,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
return 0;
}
// Move to row_win etc.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_HALFROW_SSE2
__declspec(naked) __declspec(align(16))
@ -188,7 +189,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
// Blends 32x2 pixels to 16x1
// source in scale.cc
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
@ -393,7 +394,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
// M420 format description:
// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
// Chroma is half width / half height. (420)
// src_stride_m420 is row planar. Normally this will be the width in pixels.
// The UV plane is half width, but 2 values, so src_stride_m420 applies to
// this as well as the two Y planes.
static int X420ToI420(const uint8* src_y,
@ -592,10 +593,10 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
// This policy assumes that the caller handles the last row of an odd height
// image using C.
// READSAFE_PAGE - enable read ahead within same page.
// A page is 4096 bytes. When reading ahead, if the last pixel is near the
// end of the page, and a read spans the page into the next page, a memory
// exception can occur if that page has not been allocated, or is a guard
// page. This setting ensures the overread is within the same page.
// READSAFE_ALWAYS - enables read ahead on systems without memory exceptions
// or where buffers are padded by 64 bytes.
@ -790,7 +791,7 @@ static inline uint32 READWORD(const uint8* p) {
}
#endif
// Must be multiple of 6 pixels. Will over convert to handle remainder.
// https://developer.apple.com/quicktime/icefloe/dispatch019.html#v210
static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
for (int x = 0; x < width; x += 6) {
@ -820,7 +821,7 @@ static void V210ToUYVYRow_C(const uint8* src_v210, uint8* dst_uyvy, int width) {
}
// Convert V210 to I420.
// V210 is 10 bit version of UYVY. 16 bytes to store 6 pixels.
// Width must be a multiple of 48.
LIBYUV_API
int V210ToI420(const uint8* src_v210, int src_stride_v210,
@ -1611,7 +1612,7 @@ static void JpegI400ToI420(void* opaque,
}
// MJPG (Motion JPeg) to I420
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToI420(const uint8* sample,
size_t sample_size,
@ -1689,7 +1690,7 @@ int MJPGToI420(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
@ -1734,7 +1735,7 @@ int ConvertToI420(const uint8* sample,
}
int r = 0;
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if destination y is same as source sample,

View File

@ -556,6 +556,14 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
}
}
#endif
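// Dispatch note: the _Any_ variant accepts any width of at least 8, doing
// the bulk of the row with Neon and the remainder in C, while the plain
// Neon row requires the width to be a multiple of 8.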
for (int y = 0; y < height; ++y) {
NV12ToARGBRow(src_y, src_uv, dst_argb, width);
@ -571,10 +579,10 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
// Convert NV21 to ARGB.
LIBYUV_API
int NV21ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_vu, int src_stride_vu,
const uint8* src_uv, int src_stride_uv,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
if (!src_y || !src_vu || !dst_argb ||
if (!src_y || !src_uv || !dst_argb ||
width <= 0 || height == 0) {
return -1;
}
@ -585,7 +593,7 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
void (*NV21ToARGBRow)(const uint8* y_buf,
const uint8* vu_buf,
const uint8* uv_buf,
uint8* rgb_buf,
int width) = NV21ToARGBRow_C;
#if defined(HAS_NV21TOARGBROW_SSSE3)
@ -599,13 +607,21 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
}
}
#endif
#if defined(HAS_NV21TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
}
}
#endif
for (int y = 0; y < height; ++y) {
NV21ToARGBRow(src_y, src_vu, dst_argb, width);
NV21ToARGBRow(src_y, src_uv, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
src_vu += src_stride_vu;
src_uv += src_stride_uv;
}
}
return 0;
@ -890,7 +906,7 @@ static void JpegI400ToARGB(void* opaque,
}
// MJPG (Motion JPeg) to ARGB
// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
LIBYUV_API
int MJPGToARGB(const uint8* sample,
size_t sample_size,
@ -966,7 +982,7 @@ int MJPGToARGB(const uint8* sample,
ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
} else {
// TODO(fbarchard): Implement conversion for any other colorspace/sample
// factors that occur in practice. 411 is supported by libjpeg
// ERROR: Unable to convert MJPEG frame because format is not supported
mjpeg_decoder.UnloadFrame();
return 1;
@ -1004,7 +1020,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
}
int r = 0;
// One pass rotation is available for some formats. For the rest, convert
// to I420 (with optional vertical flipping) into a temporary I420 buffer,
// and then rotate the I420 to the final destination buffer.
// For in-place conversion, if destination dst_argb is same as source sample,

View File

@ -203,7 +203,7 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
return 0;
}
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
@ -895,68 +895,50 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
}
// Convert I420 to RGB24.
// TODO(fbarchard): One step I420ToRGB24Row_NEON.
LIBYUV_API
int I420ToRGB24(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
uint8* dst_rgb24, int dst_stride_rgb24,
int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_argb ||
!dst_rgb24 ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
dst_stride_rgb24 = -dst_stride_rgb24;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
void (*I422ToRGB24Row)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToRGB24Row_C;
#if defined(HAS_I422TORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C;
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToRGB24Row = I422ToRGB24Row_NEON;
}
}
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width * 3 <= kMaxStride) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
}
#elif defined(HAS_I422TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRGB24Row(row, dst_argb, width);
dst_argb += dst_stride_argb;
I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
dst_rgb24 += dst_stride_rgb24;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;
@ -967,67 +949,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
}
// Convert I420 to RAW.
// TODO(fbarchard): One step I420ToRAWRow_NEON.
LIBYUV_API
int I420ToRAW(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
uint8* dst_argb, int dst_stride_argb,
uint8* dst_raw, int dst_stride_raw,
int width, int height) {
if (!src_y || !src_u || !src_v ||
!dst_argb ||
!dst_raw ||
width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
dst_raw = dst_raw + (height - 1) * dst_stride_raw;
dst_stride_raw = -dst_stride_raw;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_NEON)
void (*I422ToRAWRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) = I422ToRAWRow_C;
#if defined(HAS_I422TORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
#elif defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C;
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
}
if (IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToRAWRow = I422ToRAWRow_NEON;
}
}
#elif defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width * 3 <= kMaxStride) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
}
#elif defined(HAS_I422TORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
}
#endif
for (int y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, row, width);
ARGBToRAWRow(row, dst_argb, width);
dst_argb += dst_stride_argb;
I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
dst_raw += dst_stride_raw;
src_y += src_stride_y;
if (y & 1) {
src_u += src_stride_u;

View File

@ -29,7 +29,7 @@
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
asm volatile ( // NOLINT
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
@ -38,7 +38,7 @@ static __inline void __cpuid(int cpu_info[4], int info_type) {
}
#elif defined(__i386__) || defined(__x86_64__)
static __inline void __cpuid(int cpu_info[4], int info_type) {
asm volatile (
asm volatile ( // NOLINT
"cpuid \n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type));
@ -50,7 +50,7 @@ namespace libyuv {
extern "C" {
#endif
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
LIBYUV_API
@ -85,7 +85,7 @@ static uint32 XGetBV(unsigned int xcr) {
#define HAS_XGETBV
static uint32 XGetBV(unsigned int xcr) {
uint32 xcr_feature_mask;
asm volatile (
asm volatile ( // NOLINT
".byte 0x0f, 0x01, 0xd0\n"
: "=a"(xcr_feature_mask)
: "c"(xcr)
@ -124,6 +124,18 @@ int ArmCpuCaps(const char* cpuinfo_name) {
LIBYUV_API
int cpu_info_ = 0;
// Test environment variable for disabling CPU features. Any non-zero value
// disables the feature; zero is ignored, making the variable easy to toggle.
static bool TestEnv(const char* name) {
const char* var = getenv(name);
if (var) {
if (var[0] != '0') {
return true;
}
}
return false;
}
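// For example, LIBYUV_DISABLE_NEON=1 (or any non-zero value) masks off the
// Neon paths below, while LIBYUV_DISABLE_NEON=0 or an unset variable leaves
// them enabled.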
LIBYUV_API
int InitCpuFlags(void) {
#if !defined(__CLR_VER) && defined(CPU_X86)
@ -144,34 +156,33 @@ int InitCpuFlags(void) {
}
}
#endif
// environment variable overrides for testing.
if (getenv("LIBYUV_DISABLE_X86")) {
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86;
}
if (getenv("LIBYUV_DISABLE_SSE2")) {
if (TestEnv("LIBYUV_DISABLE_SSE2")) {
cpu_info_ &= ~kCpuHasSSE2;
}
if (getenv("LIBYUV_DISABLE_SSSE3")) {
if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
cpu_info_ &= ~kCpuHasSSSE3;
}
if (getenv("LIBYUV_DISABLE_SSE41")) {
if (TestEnv("LIBYUV_DISABLE_SSE41")) {
cpu_info_ &= ~kCpuHasSSE41;
}
if (getenv("LIBYUV_DISABLE_SSE42")) {
if (TestEnv("LIBYUV_DISABLE_SSE42")) {
cpu_info_ &= ~kCpuHasSSE42;
}
if (getenv("LIBYUV_DISABLE_AVX")) {
if (TestEnv("LIBYUV_DISABLE_AVX")) {
cpu_info_ &= ~kCpuHasAVX;
}
if (getenv("LIBYUV_DISABLE_AVX2")) {
if (TestEnv("LIBYUV_DISABLE_AVX2")) {
cpu_info_ &= ~kCpuHasAVX2;
}
if (getenv("LIBYUV_DISABLE_ASM")) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = kCpuInitialized;
}
#elif defined(__arm__)
#if defined(__linux__) && defined(__ARM_NEON__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
// Linux ARM: parse /proc/cpuinfo text file for Neon detection.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__)
@ -181,10 +192,10 @@ int InitCpuFlags(void) {
cpu_info_ = kCpuHasNEON;
#endif
cpu_info_ |= kCpuInitialized | kCpuHasARM;
if (getenv("LIBYUV_DISABLE_NEON")) {
if (TestEnv("LIBYUV_DISABLE_NEON")) {
cpu_info_ &= ~kCpuHasNEON;
}
if (getenv("LIBYUV_DISABLE_ASM")) {
if (TestEnv("LIBYUV_DISABLE_ASM")) {
cpu_info_ = kCpuInitialized;
}
#endif // __arm__

View File

@ -21,7 +21,7 @@ extern "C" {
#endif
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
// and vst would select which 2 components to write. The low level would need
// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

View File

@ -10,6 +10,7 @@
#include "libyuv/mjpeg_decoder.h"
#ifdef HAVE_JPEG
// Must be included before jpeglib
#include <assert.h>
#ifndef __CLR_VER
@ -80,7 +81,7 @@ MJpegDecoder::~MJpegDecoder() {
}
// Helper function to validate the jpeg looks ok.
// TODO(fbarchard): Improve performance. Scan backward for EOI?
bool ValidateJpeg(const uint8* sample, size_t sample_size) {
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
@ -105,7 +106,7 @@ bool ValidateJpeg(const uint8* sample, size_t sample_size) {
}
}
if (!total_eoi) {
// ERROR: Invalid jpeg end code not found. Size sample_size
return false;
}
return true;
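// Context for the scan above: a JPEG stream opens with an SOI marker
// (0xFF 0xD8) and closes with an EOI marker (0xFF 0xD9); total_eoi counts
// EOI markers, so zero typically means a truncated capture.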
@ -578,3 +579,5 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
}
} // namespace libyuv
#endif // HAVE_JPEG

View File

@ -105,6 +105,130 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
}
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
void (*YUY2ToUV422Row)(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void (*YUY2ToYRow)(const uint8* src_yuy2,
uint8* dst_y, int pix);
YUY2ToYRow = YUY2ToYRow_C;
YUY2ToUV422Row = YUY2ToUV422Row_C;
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
}
}
#elif defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width > 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUV422Row = YUY2ToUV422Row_NEON;
}
}
#endif
for (int y = 0; y < height; ++y) {
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width);
src_yuy2 += src_stride_yuy2;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
return 0;
}
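// Note that the 422 path pairs YUY2ToYRow with YUY2ToUV422Row, which
// subsamples chroma horizontally only; the I420 converters instead use
// YUY2ToUVRow, which additionally averages chroma across two rows.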
// Convert UYVY to I422.
LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
void (*UYVYToUV422Row)(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void (*UYVYToYRow)(const uint8* src_uyvy,
uint8* dst_y, int pix);
UYVYToYRow = UYVYToYRow_C;
UYVYToUV422Row = UYVYToUV422Row_C;
#if defined(HAS_UYVYTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
if (width > 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
}
if (IS_ALIGNED(width, 16)) {
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUV422Row = UYVYToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
}
}
#elif defined(HAS_UYVYTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (width > 8) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width > 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
}
}
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUV422Row = UYVYToUV422Row_NEON;
}
}
#endif
for (int y = 0; y < height; ++y) {
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width);
src_uyvy += src_stride_uyvy;
dst_y += dst_stride_y;
dst_u += dst_stride_u;
dst_v += dst_stride_v;
}
return 0;
}
// Mirror I420 with optional flipping
LIBYUV_API
int I420Mirror(const uint8* src_y, int src_stride_y,
@ -721,6 +845,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
#endif
#if defined(HAS_NV12TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
}
#endif
SIMD_ALIGNED(uint8 row[kMaxStride]);
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
@ -789,129 +918,6 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
return 0;
}
// SetRow8 writes 'count' bytes using a 32 bit value repeated
// SetRow32 writes 'count' words using a 32 bit value repeated
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SETROW_NEON
static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( // NOLINT
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
"vst1.u32 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
: "q0", "memory", "cc");
}
// TODO(fbarchard): Make fully assembler
static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
SetRow8_NEON(dst, v32, width << 2);
dst += dst_stride;
}
}
#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_SETROW_X86
__declspec(naked) __declspec(align(16))
static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
mov edi, edx
ret
}
}
__declspec(naked) __declspec(align(16))
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 12 + 4] // dst
mov eax, [esp + 12 + 8] // v32
mov ebp, [esp + 12 + 12] // width
mov edx, [esp + 12 + 16] // dst_stride
mov esi, [esp + 12 + 20] // height
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4
align 16
convertloop:
mov ecx, ebp
rep stosd
add edi, edx
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
ret
}
}
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SETROW_X86
static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile ( // NOLINT
"shr $0x2,%1 \n"
"rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
}
static void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile ( // NOLINT
"rep stosl \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
dst += dst_stride;
}
}
#endif
static void SetRow8_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
for (int x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
}
static void SetRows32_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
uint32* d = reinterpret_cast<uint32*>(dst);
for (int x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
}
}
LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
@ -929,13 +935,6 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
SetRow = SetRow8_X86;
}
#endif
#if defined(HAS_SETROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
SetRow = SetRow8_SSE2;
}
#endif
uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
// Set plane
@ -1242,7 +1241,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
}
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int32* dst_cumsum, int dst_stride32_cumsum,
@ -1270,7 +1269,7 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
// Blur ARGB image.
// Caller should allocate CumulativeSum table of width * height * 16 bytes
// aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
// as the buffer is treated as circular.
LIBYUV_API
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
@ -1290,7 +1289,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
}
#endif
// Compute enough CumulativeSum for first row to be blurred. After this
// one row of CumulativeSum is updated at a time.
ARGBComputeCumulativeSum(src_argb, src_stride_argb,
dst_cumsum, dst_stride32_cumsum,

View File

@ -814,7 +814,7 @@ void RotatePlane90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is a transpose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
@ -826,7 +826,7 @@ void RotatePlane270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is a transpose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
@ -880,7 +880,7 @@ void RotatePlane180(const uint8* src, int src_stride,
if (width > kMaxStride) {
return;
}
// Swap first and last row and mirror the content. Uses a temporary row.
SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);

View File

@ -58,7 +58,7 @@ void ARGBRotate90(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src += src_stride * (height - 1);
src_stride = -src_stride;
@ -69,7 +69,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst += dst_stride * (width - 1);
dst_stride = -dst_stride;
@ -109,7 +109,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
if (width * 4 > kMaxStride) {
return;
}
// Swap first and last row and mirror the content. Uses a temporary row.
SIMD_ALIGNED(uint8 row[kMaxStride]);
const uint8* src_bot = src + src_stride * (height - 1);
uint8* dst_bot = dst + dst_stride * (height - 1);

View File

@ -26,12 +26,12 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"
@ -81,7 +81,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"subs %4, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, #8 \n"
"beq 4f \n"
@ -193,12 +193,12 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_b, int dst_stride_b,
int width) {
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allows for this
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
".p2align 4 \n"
"1: \n"
"mov r9, %0 \n"
@ -264,7 +264,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"subs %6, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %6, #8 \n"
"beq 4f \n"

View File

@ -330,7 +330,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
int sb = (b * 17 + g * 68 + r * 35) >> 7;
int sg = (b * 22 + g * 88 + r * 45) >> 7;
int sr = (b * 24 + g * 98 + r * 50) >> 7;
// b does not overflow. a is preserved from original.
if (sg > 255) {
sg = 255;
}
@ -344,7 +344,7 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
}
}
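// The integer weights in ARGBSepiaRow_C above are the classic sepia matrix
// in /128 fixed point: e.g. sr = (24 * b + 98 * g + 50 * r) >> 7 is
// approximately 0.19 * B + 0.77 * G + 0.39 * R.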
// Apply color matrix to a row of image. Matrix is signed.
void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
for (int x = 0; x < width; ++x) {
int b = dst_argb[0];
@ -459,6 +459,14 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf,
(255u << ashift);
}
static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
int32 y1 = (static_cast<int32>(y) - 16) * YG;
*b = Clip(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
*g = Clip(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
*r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
}
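// In real-number form the 6-bit fixed-point math above approximates the
// usual BT.601 conversion (the YG and UB..BR constants encode these
// coefficients, scaled by 64):
//   B = 1.164 * (Y - 16) + 2.018 * (U - 128)
//   G = 1.164 * (Y - 16) - 0.391 * (U - 128) - 0.813 * (V - 128)
//   R = 1.164 * (Y - 16) + 1.596 * (V - 128)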
void I444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@ -492,6 +500,48 @@ void I422ToARGBRow_C(const uint8* y_buf,
}
}
void I422ToRGB24Row_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
}
}
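// RGB24 above stores bytes in B,G,R order; I422ToRAWRow_C below performs the
// same conversion with the first and third pointers swapped to emit R,G,B.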
void I422ToRAWRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
for (int x = 0; x < width - 1; x += 2) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
YuvPixel2(y_buf[1], u_buf[0], v_buf[0],
rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
y_buf += 2;
u_buf += 1;
v_buf += 1;
rgb_buf += 6; // Advance 2 pixels.
}
if (width & 1) {
YuvPixel2(y_buf[0], u_buf[0], v_buf[0],
rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
}
}
void I411ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@ -671,6 +721,28 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
memcpy(dst, src, count);
}
void SetRow8_C(uint8* dst, uint32 v8, int count) {
#ifdef _MSC_VER
// VC will generate rep stosb.
for (int x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
}
void SetRows32_C(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
uint32* d = reinterpret_cast<uint32*>(dst);
for (int x = 0; x < width; ++x) {
d[x] = v32;
}
dst += dst_stride;
}
}
// Filter 2 rows of YUY2 UV's (422) into U and V (420).
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int width) {
@ -950,6 +1022,11 @@ Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
#endif
#ifdef HAS_I422TORGB24ROW_SSSE3
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \
I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1)
#endif
#ifdef HAS_I422TORGBAROW_SSSE3
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
#endif
@ -958,6 +1035,10 @@ YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0)
Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0)
YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1)
YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1)
#endif
#undef YANY

View File

@ -101,8 +101,8 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOARGBROW_NEON
@ -135,8 +135,8 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOBGRAROW_NEON
@ -169,8 +169,8 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TOABGRROW_NEON
@ -202,12 +202,77 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORGBAROW_NEON
#ifdef HAS_I422TORGB24ROW_NEON
void I422ToRGB24Row_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORGB24ROW_NEON
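// The vst3.8 above writes d20/d21/d22 interleaved, emitting three bytes per
// pixel; the RAW variant below is identical except for a vswp of d20 and d22
// to exchange B and R before the store.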
#ifdef HAS_I422TORAWROW_NEON
void I422ToRAWRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
asm volatile (
"vld1.u8 {d24}, [%5] \n"
"vld1.u8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(y_buf), // %0
"+r"(u_buf), // %1
"+r"(v_buf), // %2
"+r"(rgb_buf), // %3
"+r"(width) // %4
: "r"(&kUVToRB), // %5
"r"(&kUVToG) // %6
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_I422TORAWROW_NEON
#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* y_buf,
const uint8* uv_buf,
@ -233,8 +298,8 @@ void NV12ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_NV12TOARGBROW_NEON
@ -264,8 +329,8 @@ void NV21ToARGBRow_NEON(const uint8* y_buf,
"+r"(width) // %3
: "r"(&kUVToRB), // %4
"r"(&kUVToG) // %5
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15"
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#endif // HAS_NV21TOARGBROW_NEON
@ -312,7 +377,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_NEON
#ifdef HAS_SETROW_NEON
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
asm volatile ( // NOLINT
"vdup.u32 q0, %2 \n" // duplicate 4 ints
@ -327,7 +392,7 @@ void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
}
// TODO(fbarchard): Make fully assembler
// SetRow32 writes 'count' words using a 32 bit value repeated.
void SetRows32_NEON(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
@ -344,11 +409,11 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %1, %2 \n"
// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"
// the output is written in two blocks. 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in block sizes
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"
"beq 2f \n"
@ -356,9 +421,9 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
// back of destination by the size of the register that is
// going to be mirrored
"sub %1, #16 \n"
// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"
@ -375,7 +440,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16
"bge 1b \n"
// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"
@ -430,9 +495,9 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
// going to be mirrored
"sub %1, #8 \n"
"sub %2, #8 \n"
// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"
@ -446,7 +511,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
"bge 1b \n"
// add 8 back to the counter. if the result is 0 there is no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"

View File

@ -741,9 +741,9 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
}
// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if stack frame is disabled. Doing 2 assembly blocks is a workaround
// and considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
@ -2143,6 +2143,34 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
}
#endif // HAS_COPYROW_X86
#ifdef HAS_SETROW_X86
void SetRow8_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
"shr $0x2,%1 \n"
"rep stosl \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
}
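// Note on the asm above: 'rep stosl' stores the 32-bit value in eax into the
// destination, ecx times, advancing the pointer each time; the shr by 2
// converts the byte count to a dword count, so callers are expected to pass
// a multiple of 4.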
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
for (int y = 0; y < height; ++y) {
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile (
"rep stosl \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
: "memory", "cc");
dst += dst_stride;
}
}
#endif // HAS_SETROW_X86
#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
@ -2998,7 +3026,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#endif // HAS_ARGBUNATTENUATEROW_SSE2
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
CONST vec8 kARGBToGray = {
14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};
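A quick sanity check on these weights: they are 7 bit fixed point and sum to 128, so the per-pixel computation closely matches the stated ratios.

// 14 + 76 + 38 == 128, so per pixel:
//   gray = (14 * B + 76 * G + 38 * R) >> 7
// with 14/128 = 0.109, 76/128 = 0.594, 38/128 = 0.297, i.e. roughly the
// 0.11 / 0.59 / 0.30 weights named above.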
@ -3455,7 +3483,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
// an error if movq is used. movd %%xmm0,%1
// an error if movq is used. movd %%xmm0,%1
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,

View File

@ -18,6 +18,7 @@ extern "C" {
// This module is for Visual C x86.
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
// TODO(fbarchard): I420ToRGB24, I420ToRAW
#ifdef HAS_ARGBTOYROW_SSSE3
// Constants for ARGB.
@ -2521,6 +2522,54 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_X86
#ifdef HAS_SETROW_X86
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRow8_X86(uint8* dst, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
mov edi, edx
ret
}
}
// SetRows32 writes 'width' words per row using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRows32_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height) {
__asm {
push esi
push edi
push ebp
mov edi, [esp + 12 + 4] // dst
mov eax, [esp + 12 + 8] // v32
mov ebp, [esp + 12 + 12] // width
mov edx, [esp + 12 + 16] // dst_stride
mov esi, [esp + 12 + 20] // height
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4
align 16
convertloop:
mov ecx, ebp
rep stosd
add edi, edx
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
ret
}
}
#endif // HAS_SETROW_X86
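Note how the row stride is handled in SetRows32_X86 above: `lea ecx, [ebp * 4]` / `sub edx, ecx` leaves dst_stride - width * 4 in edx, and since `rep stosd` has already advanced edi by width * 4 bytes, `add edi, edx` lands exactly at the start of the next row. For example, with width 640 and dst_stride 2560 the adjustment is 0 and the rows are written contiguously.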
#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
@ -3497,7 +3546,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,

View File

@ -54,514 +54,49 @@ void SetUseReferenceImpl(bool use) {
#define HAS_SCALEROWDOWN2_NEON
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
uint8* dst, int dst_width);
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
// row 2 add adjacent, add row 1 to row 2
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN4_NEON
static void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
static void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#define HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
static void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
// 32 -> 12
static void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
static void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
// 16x2 -> 16x1
#define HAS_SCALEFILTERROWS_NEON
static void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
/**
* SSE2 downscalers with interpolation.
@ -1010,7 +545,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Note that movdqa+palign may be better than movdqu.
@ -1049,7 +584,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
}
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Register usage:
@ -3420,7 +2955,7 @@ int I420Scale(const uint8* src_y, int src_stride_y,
dst_halfwidth = dst_width >> 1;
}
// If caller used height / 2 when computing src_v, it will point into what
// should be the src_u plane. Detect this and reduce halfheight to match.
// should be the src_u plane. Detect this and reduce halfheight to match.
int uv_src_plane_size = src_halfwidth * src_halfheight;
if ((src_height & 1) &&
(src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
@ -3484,7 +3019,7 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
dst_halfwidth = dst_width >> 1;
}
// If caller used height / 2 when computing src_v, it will point into what
// should be the src_u plane. Detect this and reduce halfheight to match.
// should be the src_u plane. Detect this and reduce halfheight to match.
int uv_src_plane_size = src_halfwidth * src_halfheight;
if ((src_height & 1) &&
(src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {

source/scale_neon.cc Normal file (534 lines)
View File

@ -0,0 +1,534 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC Neon.
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
/**
* NEON downscalers with interpolation.
*
* Provided by Fritz Koenig
*
*/
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
"1: \n"
// load even pixels into q0, odd into q1
"vld2.u8 {q0,q1}, [%0]! \n"
"vst1.u8 {q0}, [%1]! \n" // store even pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
"vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc
"vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
"vst1.u8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n" // 16 processed per loop
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
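ScaleRowDown2 point-samples the even pixels, while the Int variant is a 2x2 box filter: vpaddl/vpadal sum each 2x2 block and vrshrn divides by 4 with rounding. In plain C, what the filtered path computes per output pixel (a sketch matching the comments above, not the shipped C path):

// 2x2 box filter with rounding; each dst pixel averages a 2x2 source block.
static void ScaleRowDown2Int_C_sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // +2 rounds the shift
    s += 2;
    t += 2;
  }
}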
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld2.u8 {d0, d1}, [%0]! \n"
"vtrn.u8 d1, d0 \n"
"vshrn.u16 d0, q0, #8 \n"
"vst1.u32 {d0[1]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "q0", "q1", "memory", "cc"
);
}
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n" // load up 16x4
"vld1.u8 {q1}, [r4]! \n"
"vld1.u8 {q2}, [r5]! \n"
"vld1.u8 {q3}, [%3]! \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q0, q3 \n"
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
"vst1.u32 {d0[0]}, [%1]! \n"
"subs %2, #4 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
);
}
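Likewise, ScaleRowDown4Int is a 4x4 box filter: two levels of pairwise adds accumulate 16 source pixels and vrshrn #4 divides by 16 with rounding. A C sketch of the per-pixel math:

// 4x4 box filter with rounding; each dst pixel averages a 4x4 source block.
static void ScaleRowDown4Int_C_sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int dy = 0; dy < 4; ++dy) {
      for (int dx = 0; dx < 4; ++dx) {
        sum += src_ptr[dy * src_stride + dx];
      }
    }
    dst_ptr[x] = (sum + 8) >> 4;  // divide by 16 with rounding
    src_ptr += 4;
  }
}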
// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vmov d2, d3 \n" // order d0, d1, d2
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "d0", "d1", "d2", "d3", "memory", "cc"
);
}
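The point-sampled 4-to-3 path works by lane selection: vld4 deinterleaves every 4th pixel into d0..d3 and the vmov replaces lane 2 with lane 3, so the stored triple is source pixels 0, 1 and 3 of each group of four. Equivalent C (sketch only):

// Point sample 4 -> 3: keep pixels 0, 1 and 3 of every group of 4.
static void ScaleRowDown34_C_sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}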
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
"vmovl.u8 q8, d4 \n"
"vmovl.u8 q9, d5 \n"
"vmovl.u8 q10, d6 \n"
"vmovl.u8 q11, d7 \n"
// 3 * line_0 + line_1
"vmlal.u8 q8, d0, d24 \n"
"vmlal.u8 q9, d1, d24 \n"
"vmlal.u8 q10, d2, d24 \n"
"vmlal.u8 q11, d3, d24 \n"
// (3 * line_0 + line_1) >> 2
"vqrshrn.u16 d0, q8, #2 \n"
"vqrshrn.u16 d1, q9, #2 \n"
"vqrshrn.u16 d2, q10, #2 \n"
"vqrshrn.u16 d3, q11, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q8, d1 \n"
"vmlal.u8 q8, d0, d24 \n"
"vqrshrn.u16 d0, q8, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q8, d2 \n"
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
);
}
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
// average src line 0 with src line 1
"vrhadd.u8 q0, q0, q2 \n"
"vrhadd.u8 q1, q1, q3 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
"vmovl.u8 q3, d1 \n"
"vmlal.u8 q3, d0, d24 \n"
"vqrshrn.u16 d0, q3, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
"vrhadd.u8 d1, d1, d2 \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
"vmovl.u8 q3, d2 \n"
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
"vst3.u8 {d0, d1, d2}, [%1]! \n"
"subs %2, #24 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
: "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
);
}
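After the two source rows are merged (3:1 weighted in the _0 variant, a plain vrhadd average in the _1 variant), both filtered 3/4 paths reduce to the same three taps per group of four combined pixels, as the a0/a1/a2 comments state:

// Per group of four combined pixels s[0..3] (sketch of the tap math):
//   a0 = (3 * s[0] + s[1] + 2) >> 2
//   a1 = (s[1] + s[2] + 1) >> 1
//   a2 = (s[2] + 3 * s[3] + 2) >> 2
// The +2 / +1 terms are the rounding applied by vqrshrn / vrhadd.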
#define HAS_SCALEROWDOWN38_NEON
const uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
const uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
const vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
const vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
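These constants look odd at first glance: dividing by 6 uses 65536 / 12 and dividing by 9 uses 65536 / 18. The reason is that vqrdmulh.s16 returns roughly (a * b * 2) >> 16, i.e. it doubles the product before taking the high half, so the reciprocal is stored at half scale:

//   a * (65536 / 12) * 2 / 65536  ~=  a / 6   (65536 / 12 == 5461)
//   a * (65536 / 18) * 2 / 65536  ~=  a / 9   (65536 / 18 == 3640)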
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u8 {q3}, [%3] \n"
"1: \n"
"vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
"vst1.u8 {d4}, [%1]! \n"
"vst1.u32 {d5[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
: "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
);
}
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"vld1.u8 {q15}, [%6] \n"
"add r4, %0, %3, lsl #1 \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
"vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
"vtrn.u8 d16, d17 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
"vtrn.u8 d18, d19 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
"vpaddl.u8 q8, q8 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
"vpaddl.u8 d19, d19 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 q0, q8 \n"
"vadd.u16 d4, d3, d7 \n"
"vadd.u16 d4, d19 \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
"vqrdmulh.s16 q2, q2, q13 \n"
"vmovn.u16 d4, q2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
"vmovl.u8 q9, d18 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
"vadd.u16 q1, q9 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q15 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2), // %5
"r"(&kMult38_Div9) // %6
: "r4", "q0", "q1", "q2", "q3", "q8", "q9",
"q13", "q14", "q15", "memory", "cc"
);
}
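Geometrically, each group of 8 source columns yields 3 outputs here, matching the 3, 3, 2 stride pattern visible in kShuf38: the Div9 constant averages the two 3-wide column groups over the three rows (9 pixels each), and the Div6 constant averages the trailing 2-wide group (6 pixels), per the dst_ptr[3] comment. A sketch of one group, assuming rows r0, r1, r2:

//   dst[0] = (r0[0..2] + r1[0..2] + r2[0..2]) / 9
//   dst[1] = (r0[3..5] + r1[3..5] + r2[3..5]) / 9
//   dst[2] = (r0[6..7] + r1[6..7] + r2[6..7]) / 6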
// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
"vld1.u16 {q13}, [%4] \n"
"vld1.u8 {q14}, [%5] \n"
"add %3, %0 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
"vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
"vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
// Shuffle the input data around to align the data
// so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
// d0 = 00 10 01 11 02 12 03 13
// d1 = 40 50 41 51 42 52 43 53
"vtrn.u8 d0, d1 \n"
"vtrn.u8 d4, d5 \n"
// d2 = 20 30 21 31 22 32 23 33
// d3 = 60 70 61 71 62 72 63 73
"vtrn.u8 d2, d3 \n"
"vtrn.u8 d6, d7 \n"
// d0 = 00+10 01+11 02+12 03+13
// d2 = 40+50 41+51 42+52 43+53
"vpaddl.u8 q0, q0 \n"
"vpaddl.u8 q2, q2 \n"
// d3 = 60+70 61+71 62+72 63+73
"vpaddl.u8 d3, d3 \n"
"vpaddl.u8 d7, d7 \n"
// combine source lines
"vadd.u16 q0, q2 \n"
"vadd.u16 d4, d3, d7 \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
"vqrshrn.u16 d4, q2, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
"vmovl.u8 q1, d2 \n"
"vmovl.u8 q3, d6 \n"
// combine source lines
"vadd.u16 q1, q3 \n"
// d4 = xx 20 xx 30 xx 22 xx 32
// d5 = xx 21 xx 31 xx 23 xx 33
"vtrn.u32 d2, d3 \n"
// d4 = xx 20 xx 21 xx 22 xx 23
// d5 = xx 30 xx 31 xx 32 xx 33
"vtrn.u16 d2, d3 \n"
// 0+1+2, 3+4+5
"vadd.u16 q0, q1 \n"
// Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
"vqrdmulh.s16 q0, q0, q13 \n"
// Align for table lookup, vtbl requires registers to
// be adjacent
"vmov.u8 d2, d4 \n"
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
"vst1.u8 {d3}, [%1]! \n"
"vst1.u32 {d4[0]}, [%1]! \n"
"subs %2, #12 \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
: "r"(&kMult38_Div6), // %4
"r"(&kShuf38_2) // %5
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
);
}
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
asm volatile (
"cmp %4, #0 \n"
"beq 2f \n"
"add %2, %1 \n"
"cmp %4, #128 \n"
"beq 3f \n"
"vdup.8 d5, %4 \n"
"rsb %4, #256 \n"
"vdup.8 d4, %4 \n"
"1: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
"vmull.u8 q14, d1, d4 \n"
"vmlal.u8 q13, d2, d5 \n"
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 4f \n"
"2: \n"
"vld1.u8 {q0}, [%1]! \n"
"subs %3, #16 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 2b \n"
"b 4f \n"
"3: \n"
"vld1.u8 {q0}, [%1]! \n"
"vld1.u8 {q1}, [%2]! \n"
"subs %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vst1.u8 {q0}, [%0]! \n"
"bgt 3b \n"
"4: \n"
"vst1.u8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
"+r"(source_y_fraction) // %4
:
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
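ScaleFilterRows blends two source rows with an 8 bit vertical fraction, with fast paths for fraction 0 (copy row 0) and 128 (vrhadd average); the final vst1.u8 {d1[7]} duplicates the last output byte one slot past the row, presumably so a following horizontal pass can safely read one byte past the end. The general path in C (a sketch of the vmull/vmlal/vrshrn sequence, not the shipped code):

// Blend two rows with weights (256 - f, f) and round.
static void ScaleFilterRows_C_sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  int y1 = source_y_fraction;   // weight of row 1
  int y0 = 256 - y1;            // weight of row 0 (the rsb %4, #256)
  const uint8* s0 = src_ptr;
  const uint8* s1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (s0[x] * y0 + s1[x] * y1 + 128) >> 8;  // +128 rounds
  }
}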
#endif // __ARM_NEON__
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

View File

@ -76,8 +76,8 @@ static int ARGBTestRotate(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
mode, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;

View File

@ -80,8 +80,8 @@ static int ARGBTestFilter(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;

View File

@ -118,8 +118,8 @@ static int TestFilter(int src_width, int src_height,
printf("filter %d - %8d us C - %8d us OPT\n",
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// C version may be a little off from the optimized. Order of
// operations may introduce rounding somewhere. So do a difference
// of the buffers and look to see that the max difference isn't
// over 2.
int max_diff = 0;