BT.2020 Full Range yuvconstants

New color util to compute the constants needed based on the white point.

[ RUN      ] LibYUVColorTest.TestFullYUVV
hist	      -2	      -1	       0	       1	       2
red	       0	 1627136	13670144	 1479936	       0
green	  319285	 3456836	 9243059	 3440771	  317265
blue	       0	 1561088	14202112	 1014016	       0
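
The histogram rows count, for each channel, how often the library result differs from the floating-point reference by -2..+2 for the new BT.2020 full range path. As a sketch of the derivation the new constants follow (the WR/WB equations quoted in the BT.709 full range comments in row_common.cc), using the BT.2020 KR/KB values; this is an illustration, not the new util's actual code:

#include <stdio.h>

/* Illustrative only: derive full range YUV->RGB coefficients from the
 * luma weights KR/KB (BT.2020: KR = 0.2627, KB = 0.0593). */
static void PrintFullRangeCoefficients(double kr, double kb) {
  double kg = 1.0 - kr - kb;
  double vr = 2.0 * (1.0 - kr);           /* R = Y + VR * (V - 128) */
  double ub = 2.0 * (1.0 - kb);           /* B = Y + UB * (U - 128) */
  double vg = 2.0 * kr * (1.0 - kr) / kg; /* G = Y - UG*(U-128) - VG*(V-128) */
  double ug = 2.0 * kb * (1.0 - kb) / kg;
  printf("UB=%.6f UG=%.6f VG=%.6f VR=%.6f\n", ub, ug, vg, vr);
}

int main(void) {
  /* Should print 1.881400 0.164553 0.571353 1.474600, matching the
   * kYuvV2020Constants comments added in row_common.cc. */
  PrintFullRangeCoefficients(0.2627, 0.0593);
  return 0;
}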

Bug: libyuv:877, b/178283356
Change-Id: If432ebfab76b01302fdb416a153c4f26ca0832d6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2678859
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2021-02-05 16:14:25 -08:00 committed by Frank Barchard
parent 60d37a064b
commit 942c508448
15 changed files with 733 additions and 571 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1774
Version: 1775
License: BSD
License File: LICENSE

View File

@ -213,7 +213,7 @@ int I010ToI410(const uint16_t* src_y,
// Convert I012 to I412
#define I012ToI412 I010ToI410
// Convert I212 to I412
// Convert I210 to I410
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,

View File

@ -21,18 +21,20 @@ extern "C" {
#endif
// Conversion matrix for YUV to RGB
LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full
// Conversion matrix for YVU to BGR
LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
// Macros for end swapped destination Matrix conversions.
// Swap UV and pass mirrored kYvuJPEGConstants matrix.
@ -42,6 +44,8 @@ LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
#define kYuvF709ConstantsVU kYvuF709Constants
#define kYuvH709ConstantsVU kYvuH709Constants
#define kYuv2020ConstantsVU kYvu2020Constants
#define kYuvV2020ConstantsVU kYvuV2020Constants
#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
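
With kYuvV2020Constants exported above, callers request the BT.2020 full range conversion through the existing *Matrix entry points. A minimal usage sketch (buffer setup elided; src_y/src_u/src_v, the strides, dst_argb, width and height are assumed to describe a valid I422 image and ARGB destination):

#include "libyuv/convert_argb.h"

// Convert I422 to ARGB with the BT.2020 full range matrix; returns 0 on success.
int ret = I422ToARGBMatrix(src_y, src_stride_y,
                           src_u, src_stride_u,
                           src_v, src_stride_v,
                           dst_argb, dst_stride_argb,
                           &kYuvV2020Constants,  // new in this change
                           width, height);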

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1774
#define LIBYUV_VERSION 1775
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -30,6 +30,8 @@ static __inline int Abs(int v) {
}
// I420 To any I4xx YUV format with mirroring.
// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane
static int I420ToI4xx(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,

View File

@ -1330,234 +1330,218 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// Macros to create SIMD specific yuv to rgb conversion constants.
#if defined(__aarch64__)
#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
{UG, VG, UG, VG, UG, VG, UG, VG}, \
{UG, VG, UG, VG, UG, VG, UG, VG}, \
{BB, BG, BR, YGB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
{VG, UG, VG, UG, VG, UG, VG, UG}, \
{VG, UG, VG, UG, VG, UG, VG, UG}, \
{BR, BG, BB, YGB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}};
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{UB, VR, UB, VR, UB, VR, UB, VR}, {UB, VR, UB, VR, UB, VR, UB, VR}, \
{UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, \
{BB, BG, BR, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{VR, UB, VR, UB, VR, UB, VR, UB}, {VR, UB, VR, UB, VR, UB, VR, UB}, \
{VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, \
{BR, BG, BB, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BB, BG, BR, YGB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BR, BG, BB, YGB, 0, 0, 0, 0}, \
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{UB, UB, UB, UB, VR, VR, VR, VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BB, BG, BR, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{VR, VR, VR, VR, UB, UB, UB, UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BR, BG, BB, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}};
#else
#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
{UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
{0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
YGB, YGB}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, \
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, \
{VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
{0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, \
0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
YGB, YGB}};
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, \
-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \
{UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
{0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, \
0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, \
-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0}, \
{VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
{0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, \
0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}};
#endif
// TODO(fbarchard): Generate SIMD structures from float matrix.
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Bias values to round, and subtract 128 from U and V.
#define BB (-UB * 128 + YB)
#define BG (UG * 128 + VG * 128 + YB)
#define BR (-VR * 128 + YB)
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// BT.601 limited range YUV to RGB reference
// R = (Y - 16) * 1.164 + V * 1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 + U * 2.018
// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */
#define UB 128 /* max(128, round(2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(I601, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
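
A cross-check on the limited range coefficients above (and in the BT.709 block below): they are the corresponding full range coefficients scaled by 255/224 for chroma, and the 1.164 Y gain is 255/219. A standalone sketch of that arithmetic, not library code:

#include <stdio.h>

int main(void) {
  printf("%.3f %.3f %.3f %.3f\n",
         255.0 / 219.0,            /* ~1.164  Y gain for limited range */
         1.402 * 255.0 / 224.0,    /* ~1.596  BT.601 VR */
         1.5748 * 255.0 / 224.0,   /* ~1.793  BT.709 VR */
         1.8556 * 255.0 / 224.0);  /* ~2.112  BT.709 UB */
  return 0;
}
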
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// BT.601 full range YUV to RGB reference (aka JPEG)
// * R = Y + V * 1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y + U * 1.77200
// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
#define UB 113 /* round(1.77200 * 64) */
#define UG 22 /* round(0.34414 * 64) */
#define VG 46 /* round(0.71414 * 64) */
#define VR 90 /* round(1.40200 * 64) */
// Y contribution to R,G,B. Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGB 32 /* 64 / 2 */
#define YB 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UB -113 /* round(-1.77200 * 64) */
#define UG 22 /* round(0.34414 * 64) */
#define VG 46 /* round(0.71414 * 64) */
#define VR -90 /* round(-1.40200 * 64) */
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
MAKEYUVCONSTANTS(JPEG, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
// BT.709 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.793
// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
// B = (Y - 16) * 1.164 - U * -2.112
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// BT.709 limited range YUV to RGB reference
// R = (Y - 16) * 1.164 + V * 1.793
// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
// B = (Y - 16) * 1.164 + U * 2.112
// KR = 0.2126, KB = 0.0722
// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.112 * 64)) */
#define UG 14 /* round(0.213 * 64) */
#define VG 34 /* round(0.533 * 64) */
#define VR -115 /* round(-1.793 * 64) */
#define UB 128 /* max(128, round(2.112 * 64)) */
#define UG 14 /* round(0.213 * 64) */
#define VG 34 /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(H709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
// BT.709 full range YUV to RGB reference
// R = Y - V * -1.5748
// G = Y - U * 0.18732 - V * 0.46812
// B = Y - U * -1.8556
// WR = 0.2126
// WB = 0.0722
// WR and WB given, the equations are:
// R = Y + (2 * (1 - WR)) * V;
// G = Y - ((2 * ((WR * (1 - WR) * V) + (WB * (1 - WB) * U))) / (1 - WB - WR));
// B = Y + (2 * (1 - WB)) * U;
// R = Y + V * 1.5748
// G = Y - U * 0.18732 - V * 0.46812
// B = Y + U * 1.8556
// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
#define UB 119 /* round(1.8556 * 64) */
#define UG 12 /* round(0.18732 * 64) */
#define VG 30 /* round(0.46812 * 64) */
#define VR 101 /* round(1.5748 * 64) */
// Y contribution to R,G,B. Scale and bias. (same as jpeg)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YGB 32 /* 64 / 2 */
#define YB 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UB -119 /* round(-1.8556 * 64) */
#define UG 12 /* round(0.18732 * 64) */
#define VG 30 /* round(0.46812 * 64) */
#define VR -101 /* round(-1.5748 * 64) */
MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
MAKEYUVCONSTANTS(F709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
// BT.2020 YUV to RGB reference
// R = (Y - 16) * 1.164384 - V * -1.67867
// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
// B = (Y - 16) * 1.164384 - U * -2.14177
// Y contribution to R,G,B. Scale and bias.
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
// BT.2020 limited range YUV to RGB reference
// R = (Y - 16) * 1.164384 + V * 1.67867
// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
// B = (Y - 16) * 1.164384 + U * 2.14177
// KR = 0.2627; KB = 0.0593
// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.142 * 64)) */
#define UG 12 /* round(0.187326 * 64) */
#define VG 42 /* round(0.65042 * 64) */
#define VR -107 /* round(-1.67867 * 64) */
#define UB 128 /* max(128, round(2.142 * 64)) */
#define UG 12 /* round(0.187326 * 64) */
#define VG 42 /* round(0.65042 * 64) */
#define VR 107 /* round(1.67867 * 64) */
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// Y contribution to R,G,B. Scale and bias.
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(2020, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
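
The ~7% B channel error called out in the TODO above follows from the coefficient clamp: round(2.14177 * 64) = 137, but the coefficients are stored as 8-bit values for the SIMD multiplies, so UB is pinned at 128. A quick standalone check (not library code):

#include <stdio.h>

int main(void) {
  double ideal = 2.14177 * 64.0; /* 137.07 */
  double used = 128.0;           /* clamped to fit an 8-bit coefficient */
  printf("shortfall = %.1f%%\n", 100.0 * (ideal - used) / ideal); /* ~6.6 */
  return 0;
}
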
// BT.2020 full range YUV to RGB reference
// R = Y + V * 1.474600
// G = Y - U * 0.164553 - V * 0.571353
// B = Y + U * 1.881400
// KR = 0.2627; KB = 0.0593
#define UB 120 /* round(1.881400 * 64) */
#define UG 11 /* round(0.164553 * 64) */
#define VG 37 /* round(0.571353 * 64) */
#define VR 94 /* round(1.474600 * 64) */
// Y contribution to R,G,B. Scale and bias. (same as jpeg)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
#undef MAKEYUVCONSTANTS
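
A note on the Y terms shared by all the matrices above: the last initializer in each structure stores 0x0101 * YG, and every YG is defined with a divide by 257, so the two factors cancel and (y * 0x0101 * YG) >> 16 approximates y * gain * 64. A small standalone check of that arithmetic (illustrative; the per-architecture kernels differ in detail):

#include <stdio.h>

int main(void) {
  unsigned y = 200;
  unsigned full = (y * 0x0101u * 16320u) >> 16;  /* full range YG */
  unsigned lim = (y * 0x0101u * 18997u) >> 16;   /* limited range YG */
  printf("%u ~= %u\n", full, y * 64);            /* 12799 ~= 12800 */
  printf("%u ~= %.0f\n", lim, y * 1.164 * 64);   /* 14899 ~= 14899 */
  return 0;
}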

View File

@ -1336,7 +1336,7 @@ void ScalePlaneBilinearUp(int src_width,
}
}
// Scale plane, horizontally 2 times, vertically any time.
// Scale plane, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
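
An illustrative scalar form of the horizontal kernel these optimized rows implement, based on the "3*near+far" comments in the SIMD sources (exact phase and right-edge handling follow the library's sample-position diagrams and the _Any wrappers):

#include <stdint.h>

// Each pair of output pixels mixes two adjacent source samples with weights
// 3/4 and 1/4: (3*near + far + 2) >> 2, rounded. Assumes src has
// dst_width / 2 + 1 readable samples; the _Any wrappers handle the true edge.
static void Up2LinearRowSketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}
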
@ -1356,7 +1356,7 @@ void ScalePlaneUp2_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@ -1396,7 +1396,7 @@ void ScalePlaneUp2_Linear(int src_width,
}
}
// Scale plane, 2 times.
// Scale plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// This is used to scale U and V planes of I420 to I444.
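
The "9 3 3 1" comments in the SIMD rows describe the 2D weights this bilinear path computes. A scalar sketch of one output sample (illustrative; the real code produces two destination rows per pass from two source rows):

#include <stdint.h>

// s00/s01 are adjacent samples on one source row, s10/s11 on the next row.
// Output = (9*nearest + 3*row neighbor + 3*column neighbor + 1*diagonal + 8) >> 4.
static uint8_t Bilinear2xSketch(uint8_t s00, uint8_t s01,
                                uint8_t s10, uint8_t s11) {
  return (uint8_t)((9 * s00 + 3 * s01 + 3 * s10 + s11 + 8) >> 4);
}
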
@ -1414,7 +1414,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
@ -1449,7 +1449,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO test performance of writing one row of destination at a time
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
@ -1458,7 +1458,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
}
}
// Scale at most 14bit plane, horizontally 2 times.
// Scale at most 14 bit plane, horizontally up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
// stride is in count of uint16_t.
@ -1478,7 +1478,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@ -1512,7 +1512,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
}
}
// Scale at most 12bit plane, up 2 times.
// Scale at most 12 bit plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// stride is in count of uint16_t.
@ -1531,7 +1531,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2

View File

@ -625,7 +625,7 @@ CANY(ScaleARGBFilterCols_Any_MSA,
dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
}
// Even the C version need to be wrapped, because boundary pixels have to
// Even the C versions need to be wrapped, because boundary pixels have to
// be handled differently
SUH2LANY(ScaleRowUp2_Linear_Any_C,

View File

@ -400,7 +400,7 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
// sample position: (O is src sample position, X is dst sample position)
// Sample position: (O is src sample position, X is dst sample position)
//
// v dst_ptr at here v stop at here
// X O X X O X X O X X O X X O X
@ -417,7 +417,7 @@ void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
}
}
// sample position: (O is src sample position, X is dst sample position)
// Sample position: (O is src sample position, X is dst sample position)
//
// src_ptr at here
// X v X X X X X X X X X
@ -451,7 +451,7 @@ void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
}
}
// only suitable for at most 14bit range.
// Only suitable for at most 14 bit range.
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {

View File

@ -197,7 +197,6 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
@ -485,7 +484,6 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf2) // %2
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -532,7 +530,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@ -599,7 +596,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@ -692,7 +688,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAb2) // %3
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -736,7 +731,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAc33) // %2
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -790,7 +784,6 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@ -847,7 +840,6 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
@ -962,7 +954,6 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@ -1015,7 +1006,6 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
@ -1124,29 +1114,28 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
@ -1167,76 +1156,75 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 1(%0,%3),%%xmm4 \n"
"punpcklwd %%xmm1,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm4 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm4,%%xmm3 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 1(%0,%3),%%xmm4 \n"
"punpcklwd %%xmm1,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm4 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm4,%%xmm3 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1257,30 +1245,29 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
@ -1301,72 +1288,71 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1386,35 +1372,34 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1427,37 +1412,36 @@ void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1473,57 +1457,56 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
"vmovdqu %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above
"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above
"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below
"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1540,70 +1523,69 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
"vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
"vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
"vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
"vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
"vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
"vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1620,7 +1602,6 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
@ -1653,7 +1634,6 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
@ -1776,8 +1756,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
@ -1793,7 +1773,6 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
(void)x;
(void)dx;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
@ -1820,7 +1799,6 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -1844,7 +1822,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -1870,7 +1847,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -2057,7 +2033,6 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
(void)x;
(void)dx;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"

View File

@ -509,7 +509,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -527,7 +526,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@ -548,7 +547,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -612,7 +610,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -649,7 +646,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"

View File

@ -540,7 +540,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
@ -580,7 +579,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@ -637,7 +635,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -675,7 +672,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -1317,13 +1313,13 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
"ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n"
"ld1 {v2.h}[0], [%2], %6 \n"
"ld1 {v3.h}[0], [%3], %6 \n"
"subs %w5, %w5, #4 \n" // 4 pixels per loop.
"st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
"b.gt 1b \n"
"ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n"
"ld1 {v2.h}[0], [%2], %6 \n"
"ld1 {v3.h}[0], [%3], %6 \n"
"subs %w5, %w5, #4 \n" // 4 pixels per loop.
"st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src1_ptr), // %1
"+r"(src2_ptr), // %2

View File

@ -257,6 +257,32 @@ static void YUVUToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
static void YUVVToRGB(int y, int u, int v, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
const int kPixels = kWidth * kHeight;
const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
SIMD_ALIGNED(uint8_t orig_y[16]);
SIMD_ALIGNED(uint8_t orig_u[8]);
SIMD_ALIGNED(uint8_t orig_v[8]);
SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
memset(orig_y, y, kPixels);
memset(orig_u, u, kHalfPixels);
memset(orig_v, v, kHalfPixels);
/* YUV converted to ARGB. */
V422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
orig_pixels, kWidth * 4, kWidth, kHeight);
*b = orig_pixels[0];
*g = orig_pixels[1];
*r = orig_pixels[2];
}
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@ -405,21 +431,21 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
// BT.601 YUV to RGB reference
// BT.601 limited range YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
// JPEG YUV to RGB reference
// BT.601 full range YUV to RGB reference (aka JPEG)
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
// BT.709 YUV to RGB reference
// BT.709 limited range YUV to RGB reference
// See also http://www.equasys.de/colorconversion.html
static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
@ -434,7 +460,7 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte(y - (u - 128) * -1.8556);
}
// BT.2020 YUV to RGB reference
// BT.2020 limited range YUV to RGB reference
static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
*g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
@ -442,6 +468,13 @@ static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
}
// BT.2020 full range YUV to RGB reference
static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y + (v - 128) * 1.474600);
*g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
*b = RoundToByte(y + (u - 128) * 1.881400);
}
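The three coefficients above are not arbitrary: they fall directly out of the BT.2020 luma weights. A minimal standalone sketch (not part of the test file; kr/kb are assumed from the BT.2020 spec) that reproduces them:

#include <stdio.h>

// Hedged sketch: derive the BT.2020 full range YUV->RGB coefficients from
// the luma weights kr = 0.2627, kb = 0.0593 (assumed, not taken from the
// test itself). Expected output matches YUVVToRGBReference above.
int main() {
  const double kr = 0.2627;
  const double kb = 0.0593;
  const double kg = 1.0 - kr - kb;
  printf("VR %f\n", 2.0 * (1.0 - kr));            // 1.474600
  printf("UG %f\n", 2.0 * (1.0 - kb) * kb / kg);  // 0.164553
  printf("VG %f\n", 2.0 * (1.0 - kr) * kr / kg);  // 0.571353
  printf("UB %f\n", 2.0 * (1.0 - kb));            // 1.881400
  return 0;
}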
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@ -573,16 +606,12 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
#else
#define FASTSTEP 5
#endif
// BT.601 limited range.
TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -602,16 +631,11 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
PrintHistogram(rh, gh, bh);
}
// BT.601 full range.
TEST_F(LibYUVColorTest, TestFullYUVJ) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -631,16 +655,11 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
PrintHistogram(rh, gh, bh);
}
// BT.709 limited range.
TEST_F(LibYUVColorTest, TestFullYUVH) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -661,16 +680,11 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
PrintHistogram(rh, gh, bh);
}
// BT.709 full range.
TEST_F(LibYUVColorTest, TestFullYUVF) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -690,16 +704,11 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
PrintHistogram(rh, gh, bh);
}
// BT.2020 limited range.
TEST_F(LibYUVColorTest, TestFullYUVU) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -719,6 +728,30 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
}
PrintHistogram(rh, gh, bh);
}
// BT.2020 full range.
TEST_F(LibYUVColorTest, TestFullYUVV) {
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
int r0, g0, b0, r1, g1, b1;
int y = RANDOM256(y2);
YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
YUVVToRGB(y, u, v, &r1, &g1, &b1);
EXPECT_NEAR(r0, r1, ERROR_R);
EXPECT_NEAR(g0, g1, 2);
EXPECT_NEAR(b0, b1, ERROR_B);
++rh[r1 - r0 + 128];
++gh[g1 - g0 + 128];
++bh[b1 - b0 + 128];
}
}
}
PrintHistogram(rh, gh, bh);
}
#undef FASTSTEP
TEST_F(LibYUVColorTest, TestGreyYUVJ) {

View File

@ -558,7 +558,7 @@ TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
// Provide matrix wrappers
// Provide matrix wrappers for full range bt.709
#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
#define F420ToARGB(a, b, c, d, e, f, g, h, i, j) \
@ -572,6 +572,20 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
#define F444ToARGB(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
// Provide matrix wrappers for full range bt.2020
#define V420ToABGR(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
#define V420ToARGB(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
#define V422ToABGR(a, b, c, d, e, f, g, h, i, j) \
I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
#define V444ToABGR(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
#define V444ToARGB(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
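Outside these test-only wrappers, the new matrix is reached through the regular *ToARGBMatrix entry points. A minimal sketch of a caller, assuming a full range BT.2020 I420 frame; the wrapper name and buffer layout are illustrative only, not libyuv API:

#include "libyuv/convert_argb.h"

// Hedged sketch: convert a full range BT.2020 (V420) frame to ARGB by
// passing kYuvV2020Constants to the generic I420 matrix converter.
int ConvertV420ToARGB(const uint8_t* src_y, const uint8_t* src_u,
                      const uint8_t* src_v, uint8_t* dst_argb,
                      int width, int height) {
  return libyuv::I420ToARGBMatrix(src_y, width,            // Y plane + stride
                                  src_u, (width + 1) / 2,  // U plane + stride
                                  src_v, (width + 1) / 2,  // V plane + stride
                                  dst_argb, width * 4,     // ARGB + stride
                                  &libyuv::kYuvV2020Constants,
                                  width, height);
}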
#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
@ -643,6 +657,8 @@ TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(V420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(V420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
@ -667,6 +683,8 @@ TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
@ -677,6 +695,8 @@ TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(V444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(V444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
@ -772,6 +792,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@ -796,6 +822,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@ -820,6 +852,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
@ -829,6 +867,8 @@ TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1)
@ -837,6 +877,8 @@ TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1)
@ -845,6 +887,8 @@ TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1)
#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
BPP_B, W1280, N, NEG, OFF) \
@ -2771,6 +2815,8 @@ TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(V422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(V422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
@ -2781,6 +2827,8 @@ TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(V444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(V444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
@ -2862,6 +2910,8 @@ TESTQPLANARTOE(F420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(F420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(V420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(V420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
@ -2872,6 +2922,8 @@ TESTQPLANARTOE(H422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(V422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(V422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
@ -2880,6 +2932,8 @@ TESTQPLANARTOE(H444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(V444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
OFF, FMT_C, BPP_C) \

util/color.cc (new file, 118 lines)
View File

@ -0,0 +1,118 @@
/*
* Copyright 2021 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// This utility computes values needed to generate yuvconstants based on
// white point values.
// The yuv formulas are tuned for 8 bit YUV channels.
// For those MCs that can be represented as kr and kb:
// Full range
// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
// Limited range
// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
// mc bt
// 1 bt.709 KR = 0.2126; KB = 0.0722
// 4 fcc KR = 0.30; KB = 0.11
// 6 bt.601 KR = 0.299; KB = 0.114
// 7 SMPTE 240M KR = 0.212; KB = 0.087
// 10 bt2020 KR = 0.2627; KB = 0.0593
// BT.709 full range YUV to RGB reference
// R = Y + V * 1.5748
// G = Y - U * 0.18732 - V * 0.46812
// B = Y + U * 1.8556
// KR = 0.2126
// KB = 0.0722
// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
// // Y contribution to R,G,B. Scale and bias.
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
// #define YB 32 /* 64 / 2 */
//
// // U and V contributions to R,G,B.
// #define UB 113 /* round(1.77200 * 64) */
// #define UG 22 /* round(0.34414 * 64) */
// #define VG 46 /* round(0.71414 * 64) */
// #define VR 90 /* round(1.40200 * 64) */
//
// // Bias values to round, and subtract 128 from U and V.
// #define BB (-UB * 128 + YB)
// #define BG (UG * 128 + VG * 128 + YB)
// #define BR (-VR * 128 + YB)
int round(float v) {
return (int) (v + 0.5);
}
int main(int argc, const char* argv[]) {
  if (argc < 3) {
printf("color kr kb\n");
return -1;
}
float kr = atof(argv[1]);
float kb = atof(argv[2]);
float kg = 1 - kr - kb;
float vr = 2 * (1 - kr);
float ug = 2 * ((1 - kb) * kb / kg);
float vg = 2 * ((1 - kr) * kr / kg);
float ub = 2 * (1 - kb);
printf("Full range\n");
printf("R = Y + V * %5f\n", vr);
printf("G = Y - U * %6f - V * %6f\n", ug, vg);
printf("B = Y + U * %5f\n", ub);
printf("KR = %4f; ", kr);
printf("KB = %4f\n", kb);
// printf("KG = %4f\n", kg);
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
// #define YB 32 /* 64 / 2 */
//
// // U and V contributions to R,G,B.
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
vr = 255.f / 224.f * 2 * (1 - kr);
ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
ub = 255.f / 224.f * 2 * (1 - kb);
printf("Limited range\n");
printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
// printf("KG = %4f\n", kg);
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
// #define YB 32 /* 64 / 2 */
//
// // U and V contributions to R,G,B.
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
return 0;
}
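Usage note for the tool: invoked with the BT.2020 weights from the mc table above (the binary name is assumed from the usage string), e.g.

  color 0.2627 0.0593

it prints the full range coefficients (VR 1.4746, UG 0.164553, VG 0.571353, UB 1.8814, i.e. the values used in YUVVToRGBReference earlier in this change), their rounded 6 bit fixed point forms (UB 120, UG 11, VG 37, VR 94), and the corresponding limited range set scaled by 255/224, which feed the new kYuvV2020Constants entries.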