From 413a8d8041f1cc5a350a47c0d81cc721e64f9fd0 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 12 Apr 2019 10:20:44 -0700 Subject: [PATCH] Add AYUVToNV12 and NV21ToNV12 BUG=libyuv:832 TESTED=out/Release/libyuv_unittest --gtest_filter=*ToNV12* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 R=rrwinterton@gmail.com Change-Id: Id03b4613211fb6a6e163d10daa7c692fe31e36d8 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1560080 Commit-Queue: Frank Barchard Reviewed-by: richard winterton Reviewed-by: Frank Barchard --- README.chromium | 2 +- include/libyuv/convert.h | 11 ++ include/libyuv/planar_functions.h | 13 ++ include/libyuv/row.h | 28 ++++- include/libyuv/version.h | 2 +- source/convert.cc | 73 ++++++++++- source/convert_argb.cc | 6 +- source/planar_functions.cc | 58 ++++++++- source/row_any.cc | 5 +- source/row_common.cc | 64 ++++++++-- source/row_gcc.cc | 76 +++++------ source/row_neon.cc | 202 +++++++++++++++++++----------- source/row_neon64.cc | 199 +++++++++++++++++------------ source/row_win.cc | 8 +- source/scale_gcc.cc | 16 +-- source/scale_neon.cc | 12 +- source/scale_neon64.cc | 16 +-- unit_test/convert_test.cc | 153 +++++++++++++++++----- unit_test/planar_test.cc | 45 +++++-- util/psnr.cc | 2 +- 20 files changed, 710 insertions(+), 281 deletions(-) diff --git a/README.chromium b/README.chromium index 75bd2cfa1..f00b242a4 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1725 +Version: 1727 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 892a3c91b..fb8a38533 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -226,6 +226,17 @@ int UYVYToI420(const uint8_t* src_uyvy, int width, int height); +// Convert AYUV to NV12. +LIBYUV_API +int AYUVToNV12(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert AYUV to NV21. LIBYUV_API int AYUVToNV21(const uint8_t* src_ayuv, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 91137baba..f6f5b3edd 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -224,6 +224,19 @@ int UYVYToNV12(const uint8_t* src_uyvy, int width, int height); +// Convert NV21 to NV12. 
+LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + LIBYUV_API int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8cfec20ef..9bb488506 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -332,6 +332,7 @@ extern "C" { #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON +#define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON #define HAS_AYUVTOYROW_NEON #define HAS_BGRATOUVROW_NEON @@ -375,6 +376,7 @@ extern "C" { #define HAS_SETROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON +#define HAS_UVToVUROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON @@ -3370,17 +3372,34 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); - +void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void UVToVURow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToVURow_C(const uint8_t* src_ayuv, int stride_ayuv, +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_C(const uint8_t* src_ayuv, + int stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToVURow_NEON(const uint8_t* src_ayuv, int stride_ayuv, +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, int stride_ayuv, +void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, + int stride_ayuv, uint8_t* dst_vu, int width); @@ -4010,7 +4029,6 @@ void FloatDivToByteRow_NEON(const float* src_weights, uint8_t* dst_mask, int width); - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e6bf67e16..2b8159095 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1725 +#define LIBYUV_VERSION 1727 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index b4550685e..094d6eeed 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -880,6 +880,75 @@ int UYVYToI420(const uint8_t* src_uyvy, return 0; } +// Convert AYUV to NV12. +LIBYUV_API +int AYUVToNV12(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv, + uint8_t* dst_uv, int width) = AYUVToUVRow_C; + void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = + AYUVToYRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToUVRow = AYUVToUVRow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToUVRow = AYUVToUVRow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToUVRow = AYUVToUVRow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToUVRow = AYUVToUVRow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToUVRow = AYUVToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToUVRow = AYUVToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + AYUVToUVRow(src_ayuv, 0, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + // Convert AYUV to NV21. LIBYUV_API int AYUVToNV21(const uint8_t* src_ayuv, @@ -892,8 +961,7 @@ int AYUVToNV21(const uint8_t* src_ayuv, int height) { int y; void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv, - uint8_t* dst_vu, int width) = - AYUVToVURow_C; + uint8_t* dst_vu, int width) = AYUVToVURow_C; void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = AYUVToYRow_C; // Negative height means invert the image. @@ -2235,7 +2303,6 @@ int Android420ToI420(const uint8_t* src_y, return 0; } - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/convert_argb.cc b/source/convert_argb.cc index d9660b115..ffca4ea0d 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -2008,10 +2008,8 @@ int NV21ToYUV24(const uint8_t* src_y, int width, int height) { int y; - void (*NV21ToYUV24Row)(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) = NV21ToYUV24Row_C; + void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, + uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { return -1; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b49bf0a0b..9cab230f3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -440,7 +440,6 @@ void MergeUVPlane(const uint8_t* src_u, int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; - // Coalesce rows. // Negative height means invert the image. if (height < 0) { height = -height; @@ -504,6 +503,63 @@ void MergeUVPlane(const uint8_t* src_u, } } +// Convert NV21 to NV12. 
+LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = + UVToVURow_C; + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_vu || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_vu = src_vu + (halfheight - 1) * src_stride_vu; + src_stride_y = -src_stride_y; + src_stride_vu = -src_stride_vu; + } + // Coalesce rows. + if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_vu = dst_stride_uv = 0; + } + +#if defined(HAS_UVToVUROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UVToVURow = UVToVURow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + UVToVURow = UVToVURow_NEON; + } + } +#endif + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + for (y = 0; y < halfheight; ++y) { + UVToVURow(src_vu, dst_uv, halfwidth); + src_vu += src_stride_vu; + dst_uv += dst_stride_uv; + } + return 0; +} + // Support function for NV12 etc RGB channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API diff --git a/source/row_any.cc b/source/row_any.cc index 37bd9970f..06ca723a2 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -707,10 +707,12 @@ ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #ifdef HAS_UYVYTOYROW_MMI ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) #endif - #ifdef HAS_AYUVTOYROW_NEON ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_AYUVTOYROW_NEON +ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15) +#endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif @@ -1416,6 +1418,7 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) } #ifdef HAS_AYUVTOVUROW_NEON +ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15) ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S diff --git a/source/row_common.cc b/source/row_common.cc index 257daa6c0..8951d0037 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -3236,14 +3236,13 @@ void NV21ToYUV24Row_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - int x; for (x = 0; x < width - 1; x += 2) { dst_yuv24[0] = src_vu[0]; // V - dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[1] = src_vu[1]; // U dst_yuv24[2] = src_y[0]; // Y0 dst_yuv24[3] = src_vu[0]; // V - dst_yuv24[4] = src_vu[1]; // U + dst_yuv24[4] = src_vu[1]; // U dst_yuv24[5] = src_y[1]; // Y1 src_y += 2; src_vu += 2; @@ -3256,6 +3255,33 @@ void NV21ToYUV24Row_C(const uint8_t* src_y, } } +// Filter 2 rows of AYUV UV's (444) into UV (420). +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2x2 rows of AYUV. 
+ int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + src_ayuv += 8; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } +} + // Filter 2 rows of AYUV UV's (444) into VU (420). void AYUVToVURow_C(const uint8_t* src_ayuv, int src_stride_ayuv, @@ -3264,15 +3290,23 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, // Output a row of VU values, filtering 2x2 rows of AYUV. int x; for (x = 0; x < width; x += 2) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2; + dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; src_ayuv += 8; dst_vu += 2; } if (width & 1) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 0] + 2) >> 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 1] + 2) >> 2; - } + dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } } // Copy row of AYUV Y's into Y @@ -3280,11 +3314,23 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width; ++x) { - dst_y[x] = src_ayuv[2]; // v,u,y,a + dst_y[x] = src_ayuv[2]; // v,u,y,a src_ayuv += 4; } } +void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t u = src_uv[0]; + uint8_t v = src_uv[1]; + dst_vu[0] = v; + dst_vu[1] = u; + src_uv += 2; + dst_vu += 2; + } +} + // divide values by weights and provide mask to indicate weight of 0. 
void FloatDivToByteRow_C(const float* src_weights, const float* src_values, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 99d73053f..decd3d2e4 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -5238,7 +5238,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); + ); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -6669,7 +6669,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - #ifdef HAS_NV21TOYUV24ROW_AVX2 // begin NV21ToYUV24Row_C avx2 constants @@ -6723,48 +6722,54 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - uint8_t* src_y_ptr; uint64_t src_offset = 0; uint64_t width64; width64 = width; - src_y_ptr = (uint8_t *) src_y; + src_y_ptr = (uint8_t*)src_y; asm volatile( - "vmovdqu %5, %%ymm0 \n" //init blend value - "vmovdqu %6, %%ymm1 \n" //init blend value - "vmovdqu %7, %%ymm2 \n" //init blend value -// "sub $0x20, %3 \n" //sub 32 from width for final loop + "vmovdqu %5, %%ymm0 \n" // init blend value + "vmovdqu %6, %%ymm1 \n" // init blend value + "vmovdqu %7, %%ymm2 \n" // init blend value + // "sub $0x20, %3 \n" //sub 32 from width for final loop LABELALIGN - "1: \n" //label 1 - "vmovdqu (%0,%4), %%ymm3 \n" //src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" //src_uv+1 - "vmovdqu (%1), %%ymm5 \n" //src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" //y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" //uv+1, kSHUF1 for shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" //uv, kSHUF2 for shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" //y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" //uv+1 kSHUF4 for shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" //blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" //blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" //blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" //blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" //shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" //get results - "vmovdqu %%ymm12, 0x20(%2) \n" //store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" //get results - "add $0x20, %4 \n" //add to src buffer ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" //insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" //insert - "vmovdqu %%ymm4, (%2) \n" //store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" //store dst_yuv+40h - "add $0x60,%2 \n" //add to dst buffer ptr -// "cmp %3, %4 \n" //(width64 - 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop + "1: \n" // label 1 + "vmovdqu (%0,%4), %%ymm3 \n" // src_y + "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 + "vmovdqu (%1), %%ymm5 \n" // src_uv + "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf + "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for + // shuf + "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for + // shuf + "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf + "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for + // shuf + "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 + "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 + "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 + "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 + "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const + "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results + "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h + "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results + "add $0x20, %4 \n" // add to src buffer + // ptr + "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert + 
"vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert + "vmovdqu %%ymm4, (%2) \n" // store dst_yuv + "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h + "add $0x60,%2 \n" // add to dst buffer + // ptr + // "cmp %3, %4 \n" //(width64 - + // 32 bytes) and src_offset + "sub $0x20,%3 \n" // 32 pixels per loop "jg 1b \n" - "vzeroupper \n" //sse-avx2 transistions + "vzeroupper \n" // sse-avx2 + // transistions : "+r"(src_y), //%0 "+r"(src_vu), //%1 @@ -6780,7 +6785,8 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, "m"(kSHUF3), //%11 "m"(kSHUF4), //%12 "m"(kSHUF5) //%13 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", "xmm13", "xmm14", "xmm15"); + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", + "xmm13", "xmm14", "xmm15"); } #endif // HAS_NV21TOYUV24ROW_AVX2 diff --git a/source/row_neon.cc b/source/row_neon.cc index e440209fc..baf57495c 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -561,7 +561,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. @@ -582,7 +582,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. @@ -607,7 +607,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(width) // %4 : // Input registers : "cc", "memory", "d0", "d1", "d2" // Clobber List - ); + ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time @@ -632,7 +632,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "+r"(width) // %4 : // Input registers : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. @@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "+r"(width) // %2 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // SetRow writes 'width' bytes using an 8 bit value repeated. 
@@ -761,7 +761,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -778,7 +778,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { @@ -795,7 +795,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); + ); } #define RGB565TOARGB \ @@ -826,7 +826,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } #define ARGB1555TOARGB \ @@ -872,7 +872,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } #define ARGB4444TOARGB \ @@ -901,7 +901,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, @@ -919,7 +919,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { @@ -935,7 +935,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -950,7 +950,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { @@ -965,7 +965,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, @@ -985,7 +985,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, @@ -1005,7 +1005,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, @@ -1032,7 +1032,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); + ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, @@ -1059,7 +1059,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
@@ -1081,7 +1081,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, @@ -1241,7 +1241,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -2564,7 +2564,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, : "r"(2), // %5 "r"(6) // %6 : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // SobelY as a matrix is @@ -2601,7 +2601,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, : "r"(1), // %4 "r"(6) // %5 : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // %y passes a float as a scalar vector for vector * scalar multiply. @@ -2690,70 +2690,120 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2"); + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d0, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); } void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! 
\n" // load last 8 AYUV pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d1, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); } // Copy row of AYUV Y's into Y. // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Convert biplanar UV channel of NV12 to NV21 +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 5d045f645..449c9f394 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -608,7 +608,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. @@ -629,7 +629,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 
@@ -653,7 +653,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time @@ -677,7 +677,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Copy multiple of 32. @@ -693,7 +693,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "+r"(width) // %2 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // SetRow writes 'width' bytes using an 8 bit value repeated. @@ -800,7 +800,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -818,7 +818,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { @@ -835,7 +835,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } #define RGB565TOARGB \ @@ -867,7 +867,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); + ); } #define ARGB1555TOARGB \ @@ -924,7 +924,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } #define ARGB4444TOARGB \ @@ -955,7 +955,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, @@ -973,7 +973,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { @@ -990,7 +990,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -1005,7 +1005,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { @@ -1020,7 +1020,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, @@ -1040,7 +1040,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, @@ -1060,7 +1060,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, @@ -1087,7 +1087,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, : : "cc", "memory", "v0", "v1", "v2", 
"v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, @@ -1114,7 +1114,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. @@ -1135,7 +1135,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, @@ -1298,7 +1298,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1863,7 +1863,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); + ); } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { @@ -2611,7 +2611,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, : "r"(2LL), // %5 "r"(6LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // SobelY as a matrix is @@ -2648,7 +2648,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, : "r"(1LL), // %4 "r"(6LL) // %5 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // Caveat - rounds float to half float whereas scaling version truncates. @@ -2879,23 +2879,51 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile ( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. 
+ "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } void AYUVToVURow_NEON(const uint8_t* src_ayuv, @@ -2905,40 +2933,41 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } void FloatDivToByteRow_NEON(const float* src_weights, @@ -2962,7 +2991,7 @@ void FloatDivToByteRow_NEON(const float* src_weights, "uqxtn v1.4h, v1.4s \n" // 8 shorts "uqxtn2 v1.8h, v2.4s \n" "uqxtn v1.8b, v1.8h \n" // 8 bytes - + "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero @@ -2974,15 +3003,31 @@ void FloatDivToByteRow_NEON(const float* src_weights, "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask "b.gt 1b \n" - : "+r"(src_weights), // %0 - "+r"(src_values), // %1 - "+r"(dst_out), // %2 - "+r"(dst_mask), // %3 - "+r"(width) // %4 + : "+r"(src_weights), // %0 + "+r"(src_values), // %1 + "+r"(dst_out), // %2 + "+r"(dst_mask), // %3 + "+r"(width) // %4 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); } +// Convert biplanar UV channel of NV12 to NV21 +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + 
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values + "orr v2.16b, v0.16b, v0.16b \n" // move U after V + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 5500d7f5a..4484112c3 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4222,7 +4222,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. convertloop1: movd xmm3, [eax] // src argb lea eax, [eax + 4] @@ -5360,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5448,7 +5448,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. lea eax, [eax + 4] @@ -5534,7 +5534,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: cvttps2dq xmm0, xmm2 // x, y float to int packssdw xmm0, xmm0 // x, y as shorts diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 312236d2d..90a49f30d 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -483,7 +483,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, : "m"(kShuf0), // %0 "m"(kShuf1), // %1 "m"(kShuf2) // %2 - ); + ); asm volatile( LABELALIGN @@ -521,7 +521,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -530,7 +530,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); + ); asm volatile( LABELALIGN @@ -587,7 +587,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -596,7 +596,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); + ); asm volatile( @@ -690,7 +690,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb1), // %1 "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 - ); + ); asm volatile( LABELALIGN @@ -734,7 +734,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 - ); + ); asm volatile( LABELALIGN @@ -1272,7 +1272,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 - ); + ); asm volatile( "movd %5,%%xmm2 \n" diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 46f5ba4cd..366b155ba 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -40,7 +40,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List - ); + ); } // Read 32x1 average down and write 16x1. @@ -61,7 +61,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List - ); + ); } // Read 32x2 average down and write 16x1. 
@@ -92,7 +92,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %3 : : "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, @@ -523,7 +523,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, "+r"(src_width) // %2 : : "memory", "cc", "q0", "q1", "q2" // Clobber List - ); + ); } // TODO(Yang Zhang): Investigate less load instructions for @@ -705,7 +705,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! @@ -734,7 +734,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index f4aed5fc9..0a7b80ce1 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -38,7 +38,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List - ); + ); } // Read 32x1 average down and write 16x1. @@ -60,7 +60,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List - ); + ); } // Read 32x2 average down and write 16x1. @@ -89,7 +89,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, @@ -534,7 +534,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, "+r"(src_width) // %2 : : "memory", "cc", "v0", "v1", "v2" // Clobber List - ); + ); } // TODO(Yang Zhang): Investigate less load instructions for @@ -719,7 +719,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, @@ -742,7 +742,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, @@ -991,7 +991,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // Read 8x2 upsample with filtering and write 16x1. 
@@ -1041,7 +1041,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, "r"(14LL) // %5 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19" // Clobber List - ); + ); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 0fc6f873e..2e1670f2f 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -311,10 +311,10 @@ int I400ToNV21(const uint8_t* src_y, SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ @@ -329,21 +329,21 @@ int I400ToNV21(const uint8_t* src_y, } \ memset(dst_y_c, 1, kWidth* kHeight); \ memset(dst_uv_c, 2, \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_opt, 101, kWidth* kHeight); \ memset(dst_uv_opt, 102, \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \ - dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \ - dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ } \ int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ @@ -357,12 +357,12 @@ int I400ToNV21(const uint8_t* src_y, } \ EXPECT_LE(max_diff, 1); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ int abs_diff = \ abs(static_cast( \ - dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \ + dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ static_cast( \ - dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \ + dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ if (abs_diff > max_diff) { \ max_diff = abs_diff; \ } \ @@ -395,6 +395,99 @@ TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2) TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2) TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) +#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \ + OFF) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_uv, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 0 + OFF] = \ + (fastrand() & 0xff); \ + src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 1 + OFF] = \ + (fastrand() & 0xff); \ + } \ + } \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_c, kWidth, dst_uv_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_opt, kWidth, dst_uv_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast(dst_y_c[i * kWidth + j]) - \ + static_cast(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ + int abs_diff = \ + abs(static_cast( \ + dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ + static_cast( \ + dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + } + +#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width, _Unaligned, +, 1) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, 
SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0) + +// TODO(fbarchard): Fix msan on this unittest +// TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2) + #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \ DOY) \ @@ -680,8 +773,8 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) -#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,\ - W1280, DIFF, N, NEG, OFF) \ +#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ + BPP_B, W1280, DIFF, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ @@ -740,15 +833,15 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ - BPP_B, DIFF) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, DIFF, _Invert, -, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + DIFF) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, DIFF, _Opt, +, 0) TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2) @@ -980,6 +1073,7 @@ TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) +TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ @@ -1378,14 +1472,15 @@ TEST_F(LibYUVConvertTest, FuzzJpeg) { orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. orig_pixels[kSize - 1] = 0xff; - ValidateJpeg(orig_pixels, kSize); // Failure normally expected. + ValidateJpeg(orig_pixels, + kSize); // Failure normally expected. free_aligned_buffer_page_end(orig_pixels); } } -// Test data created in GIMP. In export jpeg, disable thumbnails etc, -// choose a subsampling, and use low quality (50) to keep size small. -// Generated with xxd -i test.jpg +// Test data created in GIMP. In export jpeg, disable +// thumbnails etc, choose a subsampling, and use low quality +// (50) to keep size small. 
Generated with xxd -i test.jpg // test 0 is J400 static const uint8_t kTest0Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, @@ -1987,8 +2082,8 @@ TEST_F(LibYUVConvertTest, TestMJPGInfo) { EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen)); - EXPECT_EQ(1, - ShowJPegInfo(kTest4Jpg, kTest4JpgLen)); // Valid but unsupported. + EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg, + kTest4JpgLen)); // Valid but unsupported. } #endif // HAVE_JPEG @@ -2906,7 +3001,8 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) { } // Test 10 bit YUV to 10 bit RGB -// Caveat: Result is near due to float rounding in expected result. +// Caveat: Result is near due to float rounding in expected +// result. TEST_F(LibYUVConvertTest, TestH010ToAR30) { const int kSize = 1024; int histogram_b[1024]; @@ -2969,7 +3065,8 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) { } // Test 10 bit YUV to 10 bit RGB -// Caveat: Result is near due to float rounding in expected result. +// Caveat: Result is near due to float rounding in expected +// result. TEST_F(LibYUVConvertTest, TestH010ToAB30) { const int kSize = 1024; int histogram_b[1024]; diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index fcd073a2a..3a5029065 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -3268,10 +3268,10 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { } float TestFloatDivToByte(int benchmark_width, - int benchmark_height, - int benchmark_iterations, - float scale, - bool opt) { + int benchmark_height, + int benchmark_iterations, + float scale, + bool opt) { int i, j; // NEON does multiple of 8, so round count up const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; @@ -3287,7 +3287,8 @@ float TestFloatDivToByte(int benchmark_width, // large values are problematic. audio is really -1 to 1. 
for (i = 0; i < kPixels; ++i) { (reinterpret_cast(src_weights))[i] = scale; - (reinterpret_cast(src_values))[i] = sinf(static_cast(i) * 0.1f); + (reinterpret_cast(src_values))[i] = + sinf(static_cast(i) * 0.1f); } memset(dst_out_c, 0, kPixels); memset(dst_out_opt, 1, kPixels); @@ -3295,24 +3296,24 @@ float TestFloatDivToByte(int benchmark_width, memset(dst_mask_opt, 3, kPixels); FloatDivToByteRow_C(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_c, dst_mask_c, kPixels); + reinterpret_cast(src_values), dst_out_c, + dst_mask_c, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_FLOATDIVTOBYTEROW_NEON FloatDivToByteRow_NEON(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_opt, dst_mask_opt, kPixels); + reinterpret_cast(src_values), dst_out_opt, + dst_mask_opt, kPixels); #else FloatDivToByteRow_C(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_opt, dst_mask_opt, kPixels); + reinterpret_cast(src_values), dst_out_opt, + dst_mask_opt, kPixels); #endif } else { FloatDivToByteRow_C(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_opt, dst_mask_opt, kPixels); + reinterpret_cast(src_values), dst_out_opt, + dst_mask_opt, kPixels); } } @@ -3347,5 +3348,23 @@ TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) { EXPECT_EQ(0, diff); } +TEST_F(LibYUVPlanarTest, UVToVURow) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_vu, kPixels * 2); + align_buffer_page_end(dst_pixels_uv, kPixels * 2); + + MemRandomize(src_pixels_vu, kPixels * 2); + memset(dst_pixels_uv, 1, kPixels * 2); + + UVToVURow_C(src_pixels_vu, dst_pixels_uv, kPixels); + + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]); + EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]); + } + + free_aligned_buffer_page_end(src_pixels_vu); + free_aligned_buffer_page_end(dst_pixels_uv); +} } // namespace libyuv diff --git a/util/psnr.cc b/util/psnr.cc index f54015bab..c7bee7f97 100644 --- a/util/psnr.cc +++ b/util/psnr.cc @@ -189,7 +189,7 @@ static uint32_t SumSquareError_SSE2(const uint8_t* src_a, , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); // NOLINT + ); // NOLINT return sse; } #endif // LIBYUV_DISABLE_X86 etc
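
Editor's usage note (illustrative only, not part of the patch): the sketch below shows how the two entry points added here, AYUVToNV12() and NV21ToNV12(), might be called. It assumes a 1280x720 frame with tightly packed strides; the buffer names and sizes are hypothetical and chosen purely for the example.

    #include <cstdint>
    #include <vector>

    #include "libyuv/convert.h"           // AYUVToNV12 (declared in this patch)
    #include "libyuv/planar_functions.h"  // NV21ToNV12 (declared in this patch)

    int main() {
      const int width = 1280;
      const int height = 720;
      const int half_width = (width + 1) / 2;
      const int half_height = (height + 1) / 2;

      // Packed 4:4:4 AYUV source, 4 bytes per pixel; byte order is V, U, Y, A
      // (AYUVToYRow_C reads Y from src_ayuv[2]).
      std::vector<uint8_t> ayuv(static_cast<size_t>(width) * 4 * height);

      // NV12 destination: full-resolution Y plane plus an interleaved UV plane
      // subsampled 2x2.
      std::vector<uint8_t> y(static_cast<size_t>(width) * height);
      std::vector<uint8_t> uv(static_cast<size_t>(half_width) * 2 * half_height);

      // AYUV -> NV12: extracts Y and box-filters 2x2 blocks of U/V down to 4:2:0.
      int ret = libyuv::AYUVToNV12(ayuv.data(), width * 4,     // src AYUV, stride in bytes
                                   y.data(), width,            // dst Y
                                   uv.data(), half_width * 2,  // dst interleaved UV
                                   width, height);
      if (ret != 0) {
        return ret;
      }

      // NV21 -> NV12: copies the Y plane (skipped when dst_y is NULL) and
      // byte-swaps the interleaved VU plane to UV via UVToVURow.
      std::vector<uint8_t> vu(static_cast<size_t>(half_width) * 2 * half_height);
      std::vector<uint8_t> y2(static_cast<size_t>(width) * height);
      std::vector<uint8_t> uv2(static_cast<size_t>(half_width) * 2 * half_height);
      ret = libyuv::NV21ToNV12(y.data(), width,             // src Y
                               vu.data(), half_width * 2,   // src interleaved VU
                               y2.data(), width,            // dst Y
                               uv2.data(), half_width * 2,  // dst interleaved UV
                               width, height);
      return ret;
    }

Design note: both wrappers follow the existing row-function pattern in this patch — a C fallback (AYUVToUVRow_C, UVToVURow_C) with NEON specializations selected at runtime via TestCpuFlag(), plus Any_ wrappers in row_any.cc for widths that are not a multiple of 16; the SSE2/AVX2 branches in AYUVToNV12 are placeholders for future Intel code.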