Add 12 bit YUV to 10 bit RGB

Bug: libyuv:843
Change-Id: I0104c8fcaeed09e83d2fd654c6a5e7d41bcb74cf
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2727775
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
This commit is contained in:
Frank Barchard 2021-03-04 12:33:02 -08:00 committed by Frank Barchard
parent 95ff456c33
commit ba033a11e3
14 changed files with 1127 additions and 517 deletions

View File

@ -4,7 +4,9 @@ Formats (FOURCC) supported by libyuv are detailed here.
# Core Formats # Core Formats
There are 2 core formats supported by libyuv - I420 and ARGB. All YUV formats can be converted to/from I420. All RGB formats can be converted to/from ARGB. There are 2 core formats supported by libyuv - I420 and ARGB.
All YUV formats can be converted to/from I420.
All RGB formats can be converted to/from ARGB.
Filtering functions such as scaling and planar functions work on I420 and/or ARGB. Filtering functions such as scaling and planar functions work on I420 and/or ARGB.
@ -109,6 +111,27 @@ The following is extracted from video_common.h as a complete list of formats sup
I444, NV24 and NV42 are full width, full height I444, NV24 and NV42 are full width, full height
I400 and J400 have no chroma channel. I400 and J400 have no chroma channel.
# Color space
The YUV formats start with a letter to specify the color space. e.g. I420
I = BT.601 limited range
J = BT.601 full range (J = JPEG that uses this)
H = BT.709 limited range (H for HD)
F = BT.709 full range (F for Full range)
U = BT.2020 limited range (U for UHD)
V = BT.2020 full range
For YUV to RGB conversions, a matrix can be passed. See also convert_argb.h
# HDR formats
Planar formats with 10 or 12 bits use the following fourcc:
I010, I012, P010, P012 are half width, half height
I210, I212, P210, P212 are half width, full height
I410, I412, P410, P412 are full width, full height
where
I is the color space (see above) and 3 planes: Y, U and V.
P is a biplanar format, similar to NV12 but 16 bits, with the valid bits in the high bits. There is a Y plane and a UV plane.
0, 2 or 4 is the last digit of subsampling: 4:2:0, 4:2:2, or 4:4:4
10 or 12 is the bits per channel. The bits are in the low bits of a 16 bit channel.
# The ARGB FOURCC # The ARGB FOURCC
There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers. There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers.

View File

@ -1488,6 +1488,34 @@ int I010ToARGBMatrix(const uint16_t* src_y,
int width, int width,
int height); int height);
// Convert 12 bit YUV to AR30 with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I212ToAR30 to
// multiply 12 bit yuv into high bits to allow any number of bits.
LIBYUV_API
int I012ToAR30Matrix(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_ar30,
int dst_stride_ar30,
const struct YuvConstants* yuvconstants,
int width,
int height);
// Convert 12 bit YUV to ARGB with matrix.
LIBYUV_API
int I012ToARGBMatrix(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint8_t* dst_argb,
int dst_stride_argb,
const struct YuvConstants* yuvconstants,
int width,
int height);
// Convert 10 bit 422 YUV to ARGB with matrix. // Convert 10 bit 422 YUV to ARGB with matrix.
LIBYUV_API LIBYUV_API
int I210ToARGBMatrix(const uint16_t* src_y, int I210ToARGBMatrix(const uint16_t* src_y,

View File

@ -282,6 +282,8 @@ extern "C" {
#define HAS_HALFMERGEUVROW_SSSE3 #define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3
#define HAS_I212TOAR30ROW_SSSE3
#define HAS_I212TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2 #define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I422TOAR30ROW_SSSE3
#define HAS_I410TOAR30ROW_SSSE3 #define HAS_I410TOAR30ROW_SSSE3
@ -320,6 +322,8 @@ extern "C" {
#define HAS_MERGEARGBROW_AVX2 #define HAS_MERGEARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2 #define HAS_I210TOARGBROW_AVX2
#define HAS_I212TOAR30ROW_AVX2
#define HAS_I212TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2
#define HAS_I410TOAR30ROW_AVX2 #define HAS_I410TOAR30ROW_AVX2
#define HAS_I410TOARGBROW_AVX2 #define HAS_I410TOARGBROW_AVX2
@ -721,9 +725,15 @@ struct YuvConstants {
#else #else
// This struct is for Intel color conversion. // This struct is for Intel color conversion.
struct YuvConstants { struct YuvConstants {
#if LIBYUV_UNLIMITED_DATA
uint8_t kUVToB[32];
uint8_t kUVToG[32];
uint8_t kUVToR[32];
#else
int8_t kUVToB[32]; int8_t kUVToB[32];
int8_t kUVToG[32]; int8_t kUVToG[32];
int8_t kUVToR[32]; int8_t kUVToR[32];
#endif
int16_t kUVBiasB[16]; int16_t kUVBiasB[16];
int16_t kUVBiasG[16]; int16_t kUVBiasG[16];
int16_t kUVBiasR[16]; int16_t kUVBiasR[16];
@ -2040,10 +2050,10 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
int depth, int depth,
int width); int width);
void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, void MergeUVRow_16_Any_AVX2(const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
uint16_t* dst_uv, uint16_t* dst_uv,
int depth, int depth,
int width); int width);
void MergeUVRow_16_NEON(const uint16_t* src_u, void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
uint16_t* dst_uv, uint16_t* dst_uv,
@ -2591,6 +2601,18 @@ void I210ToARGBRow_C(const uint16_t* src_y,
uint8_t* rgb_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I212ToAR30Row_C(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I212ToARGBRow_C(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I410ToAR30Row_C(const uint16_t* src_y, void I410ToAR30Row_C(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
@ -2617,7 +2639,6 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y,
uint8_t* rgb_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I444AlphaToARGBRow_C(const uint8_t* src_y, void I444AlphaToARGBRow_C(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
@ -2769,6 +2790,18 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I212ToAR30Row_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I212ToARGBRow_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I410ToAR30Row_SSSE3(const uint16_t* src_y, void I410ToAR30Row_SSSE3(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
@ -2813,6 +2846,18 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf,
uint8_t* dst_ar30, uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I212ToARGBRow_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I212ToAR30Row_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void I410ToAR30Row_AVX2(const uint16_t* src_y, void I410ToAR30Row_AVX2(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
@ -3081,6 +3126,18 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
uint8_t* dst_ptr, uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I410ToAR30Row_Any_SSSE3(const uint16_t* src_y, void I410ToAR30Row_Any_SSSE3(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
@ -3125,6 +3182,18 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
uint8_t* dst_ptr, uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I410ToAR30Row_Any_AVX2(const uint16_t* src_y, void I410ToAR30Row_Any_AVX2(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
@ -3788,25 +3857,25 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void P210ToARGBRow_NEON(const uint16_t* y_buf, void P210ToARGBRow_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf, const uint16_t* uv_buf,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void P410ToARGBRow_NEON(const uint16_t* y_buf, void P410ToARGBRow_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf, const uint16_t* uv_buf,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void P210ToAR30Row_NEON(const uint16_t* y_buf, void P210ToAR30Row_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf, const uint16_t* uv_buf,
uint8_t* dst_ar30, uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void P410ToAR30Row_NEON(const uint16_t* y_buf, void P410ToAR30Row_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf, const uint16_t* uv_buf,
uint8_t* dst_ar30, uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, void P210ToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf, const uint16_t* uv_buf,
uint8_t* dst_argb, uint8_t* dst_argb,

View File

@ -626,13 +626,13 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width); int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width); int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint16_t* dst_ptr, uint16_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width); int dst_width);
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width); int dst_width);
@ -682,8 +682,8 @@ void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width); int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width); int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint16_t* dst_ptr, uint16_t* dst_ptr,

View File

@ -888,6 +888,63 @@ int U010ToAB30(const uint16_t* src_y,
&kYuv2020Constants, width, height); &kYuv2020Constants, width, height);
} }
// Convert 12 bit 420 YUV (I012) to AR30 (2:10:10:10) with conversion matrix.
// TODO(fbarchard): Consider passing scale multiplier to I212ToAR30 to
// multiply 12 bit yuv into high bits to allow any number of bits.
LIBYUV_API
int I012ToAR30Matrix(const uint16_t* src_y,
                     int src_stride_y,
                     const uint16_t* src_u,
                     int src_stride_u,
                     const uint16_t* src_v,
                     int src_stride_v,
                     uint8_t* dst_ar30,
                     int dst_stride_ar30,
                     const struct YuvConstants* yuvconstants,
                     int width,
                     int height) {
  int y;
  // Row converter starts as portable C and is upgraded below when a SIMD
  // version is compiled in and supported by the running CPU.
  void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
                        const uint16_t* v_buf, uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants, int width) =
      I212ToAR30Row_C;
  // Reject null planes and non-positive width; height == 0 is also invalid.
  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image: start at the last destination
  // row and walk the destination stride backwards.
  if (height < 0) {
    height = -height;
    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
    dst_stride_ar30 = -dst_stride_ar30;
  }
#if defined(HAS_I212TOAR30ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    // "Any" variant handles unaligned widths; exact variant when width % 8 == 0.
    I212ToAR30Row = I212ToAR30Row_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
      I212ToAR30Row = I212ToAR30Row_SSSE3;
    }
  }
#endif
#if defined(HAS_I212TOAR30ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    I212ToAR30Row = I212ToAR30Row_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
      I212ToAR30Row = I212ToAR30Row_AVX2;
    }
  }
#endif
  for (y = 0; y < height; ++y) {
    I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
    dst_ar30 += dst_stride_ar30;
    src_y += src_stride_y;
    // 420 subsampling: chroma planes are half height, so advance U and V
    // only every other output row.
    if (y & 1) {
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
  return 0;
}
// Convert 10 bit YUV to ARGB with matrix. // Convert 10 bit YUV to ARGB with matrix.
// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
// multiply 10 bit yuv into high bits to allow any number of bits. // multiply 10 bit yuv into high bits to allow any number of bits.
@ -1061,7 +1118,7 @@ int I410ToAR30Matrix(const uint16_t* src_y,
void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf, const uint16_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) = const struct YuvConstants* yuvconstants, int width) =
I410ToAR30Row_C; I410ToAR30Row_C;
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -1260,6 +1317,61 @@ int U010ToABGR(const uint16_t* src_y,
width, height); width, height);
} }
// Convert 12 bit 420 YUV (I012) to 8 bit ARGB with conversion matrix.
LIBYUV_API
int I012ToARGBMatrix(const uint16_t* src_y,
                     int src_stride_y,
                     const uint16_t* src_u,
                     int src_stride_u,
                     const uint16_t* src_v,
                     int src_stride_v,
                     uint8_t* dst_argb,
                     int dst_stride_argb,
                     const struct YuvConstants* yuvconstants,
                     int width,
                     int height) {
  int y;
  // Row converter starts as portable C and is upgraded below when a SIMD
  // version is compiled in and supported by the running CPU.
  void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
                        const uint16_t* v_buf, uint8_t* rgb_buf,
                        const struct YuvConstants* yuvconstants, int width) =
      I212ToARGBRow_C;
  // Reject null planes and non-positive width; height == 0 is also invalid.
  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image: start at the last destination
  // row and walk the destination stride backwards.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
#if defined(HAS_I212TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    // "Any" variant handles unaligned widths; exact variant when width % 8 == 0.
    I212ToARGBRow = I212ToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
      I212ToARGBRow = I212ToARGBRow_SSSE3;
    }
  }
#endif
#if defined(HAS_I212TOARGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    I212ToARGBRow = I212ToARGBRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
      I212ToARGBRow = I212ToARGBRow_AVX2;
    }
  }
#endif
  for (y = 0; y < height; ++y) {
    I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    // 420 subsampling: chroma planes are half height, so advance U and V
    // only every other output row.
    if (y & 1) {
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
  return 0;
}
// Convert 10 bit 422 YUV to ARGB with matrix. // Convert 10 bit 422 YUV to ARGB with matrix.
LIBYUV_API LIBYUV_API
int I210ToARGBMatrix(const uint16_t* src_y, int I210ToARGBMatrix(const uint16_t* src_y,
@ -1437,7 +1549,7 @@ int I410ToARGBMatrix(const uint16_t* src_y,
void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
const uint16_t* v_buf, uint8_t* rgb_buf, const uint16_t* v_buf, uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants, int width) = const struct YuvConstants* yuvconstants, int width) =
I410ToARGBRow_C; I410ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -1484,9 +1596,9 @@ int P010ToARGBMatrix(const uint16_t* src_y,
int width, int width,
int height) { int height) {
int y; int y;
void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, void (*P210ToARGBRow)(
const struct YuvConstants* yuvconstants, int width) = const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
P210ToARGBRow_C; const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -1534,9 +1646,9 @@ int P210ToARGBMatrix(const uint16_t* src_y,
int width, int width,
int height) { int height) {
int y; int y;
void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, void (*P210ToARGBRow)(
const struct YuvConstants* yuvconstants, int width) = const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
P210ToARGBRow_C; const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C;
if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -1582,9 +1694,9 @@ int P010ToAR30Matrix(const uint16_t* src_y,
int width, int width,
int height) { int height) {
int y; int y;
void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, void (*P210ToAR30Row)(
const struct YuvConstants* yuvconstants, int width) = const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
P210ToAR30Row_C; const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -1632,9 +1744,9 @@ int P210ToAR30Matrix(const uint16_t* src_y,
int width, int width,
int height) { int height) {
int y; int y;
void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, void (*P210ToAR30Row)(
const struct YuvConstants* yuvconstants, int width) = const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf,
P210ToAR30Row_C; const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C;
if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) {
return -1; return -1;
} }

View File

@ -138,19 +138,47 @@ ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
} }
#ifdef HAS_I210ALPHATOARGBROW_SSSE3 #ifdef HAS_I210ALPHATOARGBROW_SSSE3
ANY41CT(I210AlphaToARGBRow_Any_SSSE3, I210AlphaToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) ANY41CT(I210AlphaToARGBRow_Any_SSSE3,
I210AlphaToARGBRow_SSSE3,
1,
0,
uint16_t,
2,
4,
7)
#endif #endif
#ifdef HAS_I210ALPHATOARGBROW_AVX2 #ifdef HAS_I210ALPHATOARGBROW_AVX2
ANY41CT(I210AlphaToARGBRow_Any_AVX2, I210AlphaToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) ANY41CT(I210AlphaToARGBRow_Any_AVX2,
I210AlphaToARGBRow_AVX2,
1,
0,
uint16_t,
2,
4,
15)
#endif #endif
#ifdef HAS_I410ALPHATOARGBROW_SSSE3 #ifdef HAS_I410ALPHATOARGBROW_SSSE3
ANY41CT(I410AlphaToARGBRow_Any_SSSE3, I410AlphaToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) ANY41CT(I410AlphaToARGBRow_Any_SSSE3,
I410AlphaToARGBRow_SSSE3,
0,
0,
uint16_t,
2,
4,
7)
#endif #endif
#ifdef HAS_I410ALPHATOARGBROW_AVX2 #ifdef HAS_I410ALPHATOARGBROW_AVX2
ANY41CT(I410AlphaToARGBRow_Any_AVX2, I410AlphaToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) ANY41CT(I410AlphaToARGBRow_Any_AVX2,
I410AlphaToARGBRow_AVX2,
0,
0,
uint16_t,
2,
4,
15)
#endif #endif
#undef ANY41CT #undef ANY41CT
@ -382,6 +410,18 @@ ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
#ifdef HAS_I210TOARGBROW_MMI #ifdef HAS_I210TOARGBROW_MMI
ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7) ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7)
#endif #endif
#ifdef HAS_I212TOAR30ROW_SSSE3
ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I212TOARGBROW_SSSE3
ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I212TOARGBROW_AVX2
ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#ifdef HAS_I212TOAR30ROW_AVX2
ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#undef ANY31CT #undef ANY31CT
// Any 2 planes to 1. // Any 2 planes to 1.

View File

@ -21,6 +21,11 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// These 2 macros control YUV to RGB using unsigned math to extend range.
// They can be used separately to enable new code and old data (clamped)
// LIBYUV_UNLIMITED_DATA
// LIBYUV_UNLIMITED_CODE
// The following ifdef from row_win makes the C code match the row_win code, // The following ifdef from row_win makes the C code match the row_win code,
// which is 7 bit fixed point. // which is 7 bit fixed point.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
@ -1395,7 +1400,11 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// KR = 0.299; KB = 0.114 // KR = 0.299; KB = 0.114
// U and V contributions to R,G,B. // U and V contributions to R,G,B.
#if LIBYUV_UNLIMITED_DATA
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* max(128, round(2.018 * 64)) */ #define UB 128 /* max(128, round(2.018 * 64)) */
#endif
#define UG 25 /* round(0.391 * 64) */ #define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */ #define VG 52 /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */ #define VR 102 /* round(1.596 * 64) */
@ -1444,9 +1453,12 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// B = (Y - 16) * 1.164 + U * 2.112 // B = (Y - 16) * 1.164 + U * 2.112
// KR = 0.2126, KB = 0.0722 // KR = 0.2126, KB = 0.0722
// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B. // U and V contributions to R,G,B.
#if LIBYUV_UNLIMITED_DATA
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* max(128, round(2.112 * 64)) */ #define UB 128 /* max(128, round(2.112 * 64)) */
#endif
#define UG 14 /* round(0.213 * 64) */ #define UG 14 /* round(0.213 * 64) */
#define VG 34 /* round(0.533 * 64) */ #define VG 34 /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */ #define VR 115 /* round(1.793 * 64) */
@ -1495,9 +1507,12 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// B = (Y - 16) * 1.164384 + U * 2.14177 // B = (Y - 16) * 1.164384 + U * 2.14177
// KR = 0.2627; KB = 0.0593 // KR = 0.2627; KB = 0.0593
// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
// U and V contributions to R,G,B. // U and V contributions to R,G,B.
#if LIBYUV_UNLIMITED_DATA
#define UB 137 /* round(2.142 * 64) */
#else
#define UB 128 /* max(128, round(2.142 * 64)) */ #define UB 128 /* max(128, round(2.142 * 64)) */
#endif
#define UG 12 /* round(0.187326 * 64) */ #define UG 12 /* round(0.187326 * 64) */
#define VG 42 /* round(0.65042 * 64) */ #define VG 42 /* round(0.65042 * 64) */
#define VR 107 /* round(1.67867 * 64) */ #define VR 107 /* round(1.67867 * 64) */
@ -1545,15 +1560,61 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef MAKEYUVCONSTANTS #undef MAKEYUVCONSTANTS
#if LIBYUV_UNLIMITED_DATA
// C reference code that mimics the YUV assembly.
// Reads one 8 bit YUV pixel and clamps the result down to 8 bit RGB
// (note: despite being the "unlimited data" variant, it writes uint8_t).
// Compiled only when LIBYUV_UNLIMITED_DATA is set; in that configuration
// the x86 kUVToB/kUVToR tables are unsigned (see struct YuvConstants), so
// the U-to-B and V-to-R coefficients are negated here before use.
static __inline void YuvPixel(uint8_t y,
                              uint8_t u,
                              uint8_t v,
                              uint8_t* b,
                              uint8_t* g,
                              uint8_t* r,
                              const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
  // AArch64 constant layout: packed RB and G tables plus a BGR bias triple.
  int ub = yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = yuvconstants->kUVToRB[1];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
  // 32-bit ARM layout: V coefficients live at index 4 of the packed tables.
  int ub = yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[4];
  int vr = yuvconstants->kUVToRB[4];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[1];
#else
  // Intel layout: separate per-channel tables; B and R coefficients are
  // stored unsigned here and negated to match the assembly's math.
  int ub = -yuvconstants->kUVToB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = -yuvconstants->kUVToR[1];
  int bb = yuvconstants->kUVBiasB[0];
  int bg = yuvconstants->kUVBiasG[0];
  int br = yuvconstants->kUVBiasR[0];
  int yg = yuvconstants->kYToRgb[0];
#endif
  // Replicate 8 bit Y to 16 bits (y * 0x0101), apply the Y gain, and keep
  // the high 16 bits as the luma contribution.
  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
  // Coefficients are scaled by 64 (see the round(x * 64) tables above),
  // hence the final >> 6 before clamping to [0, 255].
  *b = Clamp((int32_t)(y1 + (u * ub) + bb) >> 6);
  *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
  *r = Clamp((int32_t)(y1 + (v * vr) + br) >> 6);
}
#else
// C reference code that mimics the YUV assembly. // C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 8 bit. // Reads 8 bit YUV and leaves result as 8 bit.
static __inline void YuvPixel8_8(uint8_t y, static __inline void YuvPixel(uint8_t y,
uint8_t u, uint8_t u,
uint8_t v, uint8_t v,
uint8_t* b, uint8_t* b,
uint8_t* g, uint8_t* g,
uint8_t* r, uint8_t* r,
const struct YuvConstants* yuvconstants) { const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__) #if defined(__aarch64__)
int ub = -yuvconstants->kUVToRB[0]; int ub = -yuvconstants->kUVToRB[0];
int ug = yuvconstants->kUVToG[0]; int ug = yuvconstants->kUVToG[0];
@ -1584,10 +1645,11 @@ static __inline void YuvPixel8_8(uint8_t y,
#endif #endif
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
*b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6); *b = Clamp((int32_t)(y1 - (u * ub) + bb) >> 6);
*g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6); *g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
*r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6); *r = Clamp((int32_t)(y1 - (v * vr) + br) >> 6);
} }
#endif
// Reads 8 bit YUV and leaves result as 16 bit. // Reads 8 bit YUV and leaves result as 16 bit.
static __inline void YuvPixel8_16(uint8_t y, static __inline void YuvPixel8_16(uint8_t y,
@ -1627,9 +1689,9 @@ static __inline void YuvPixel8_16(uint8_t y,
#endif #endif
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
*b = (int)(-(u * ub) + y1 + bb); *b = (int)(y1 - (u * ub) + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg); *g = (int)(y1 - (u * ug + v * vg) + bg);
*r = (int)(-(v * vr) + y1 + br); *r = (int)(y1 - (v * vr) + br);
} }
// C reference code that mimics the YUV 16 bit assembly. // C reference code that mimics the YUV 16 bit assembly.
@ -1678,15 +1740,61 @@ static __inline void YuvPixel10_16(uint16_t y,
*r = (int)(-(v * vr) + y1 + br); *r = (int)(-(v * vr) + y1 + br);
} }
// C reference code that mimics the YUV 16 bit assembly.
// Reads 12 bit YUV and leaves the result as 16 bit fixed point; the caller
// (e.g. YuvPixel12) shifts right by 6 and clamps to produce 8 bit RGB.
// NOTE(review): parameters are int16_t; 12 bit inputs (<= 4095) fit, but a
// full 16 bit value would not — confirm callers only pass 12 bit data.
static __inline void YuvPixel12_16(int16_t y,
                                   int16_t u,
                                   int16_t v,
                                   int* b,
                                   int* g,
                                   int* r,
                                   const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
  // AArch64 constant layout: packed RB and G tables plus a BGR bias triple.
  int ub = -yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = -yuvconstants->kUVToRB[1];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[1];
#elif defined(__arm__)
  // 32-bit ARM layout: V coefficients live at index 4 of the packed tables.
  int ub = -yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[4];
  int vr = -yuvconstants->kUVToRB[4];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[1];
#else
  // Intel layout: separate per-channel coefficient and bias tables.
  int ub = yuvconstants->kUVToB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = yuvconstants->kUVToR[1];
  int bb = yuvconstants->kUVBiasB[0];
  int bg = yuvconstants->kUVBiasG[0];
  int br = yuvconstants->kUVBiasR[0];
  int yg = yuvconstants->kYToRgb[0];
#endif
  // Scale 12 bit Y up to 16 bits (<< 4), apply the Y gain, and keep the
  // high 16 bits as the luma contribution.
  uint32_t y1 = (uint32_t)((y << 4) * yg) >> 16;
  // Reduce 12 bit chroma to 8 bit for the 8 bit coefficient tables.
  u = clamp255(u >> 4);
  v = clamp255(v >> 4);
  *b = (int)(-(u * ub) + y1 + bb);
  *g = (int)(-(u * ug + v * vg) + y1 + bg);
  *r = (int)(-(v * vr) + y1 + br);
}
// C reference code that mimics the YUV 10 bit assembly. // C reference code that mimics the YUV 10 bit assembly.
// Reads 10 bit YUV and clamps down to 8 bit RGB. // Reads 10 bit YUV and clamps down to 8 bit RGB.
static __inline void YuvPixel10_8(uint16_t y, static __inline void YuvPixel10(uint16_t y,
uint16_t u, uint16_t u,
uint16_t v, uint16_t v,
uint8_t* b, uint8_t* b,
uint8_t* g, uint8_t* g,
uint8_t* r, uint8_t* r,
const struct YuvConstants* yuvconstants) { const struct YuvConstants* yuvconstants) {
int b16; int b16;
int g16; int g16;
int r16; int r16;
@ -1696,6 +1804,24 @@ static __inline void YuvPixel10_8(uint16_t y,
*r = Clamp(r16 >> 6); *r = Clamp(r16 >> 6);
} }
// C reference code that mimics the YUV 12 bit assembly.
// Converts one 12 bit YUV pixel and clamps the result down to 8 bit RGB.
static __inline void YuvPixel12(uint16_t y,
                                uint16_t u,
                                uint16_t v,
                                uint8_t* b,
                                uint8_t* g,
                                uint8_t* r,
                                const struct YuvConstants* yuvconstants) {
  // Fixed point intermediates with 6 fractional bits.
  int b_fix, g_fix, r_fix;
  YuvPixel12_16(y, u, v, &b_fix, &g_fix, &r_fix, yuvconstants);
  // Drop the fractional bits and clamp each channel to [0, 255].
  *b = Clamp(b_fix >> 6);
  *g = Clamp(g_fix >> 6);
  *r = Clamp(r_fix >> 6);
}
// C reference code that mimics the YUV 16 bit assembly. // C reference code that mimics the YUV 16 bit assembly.
// Reads 16 bit YUV and leaves result as 8 bit. // Reads 16 bit YUV and leaves result as 8 bit.
static __inline void YuvPixel16_8(uint16_t y, static __inline void YuvPixel16_8(uint16_t y,
@ -1783,9 +1909,9 @@ static __inline void YuvPixel16_16(uint16_t y,
uint32_t y1 = (uint32_t)(y * yg) >> 16; uint32_t y1 = (uint32_t)(y * yg) >> 16;
u = clamp255(u >> 8); u = clamp255(u >> 8);
v = clamp255(v >> 8); v = clamp255(v >> 8);
*b = (int)(-(u * ub) + y1 + bb); *b = (int)(y1 + -(u * ub) + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg); *g = (int)(y1 + -(u * ug + v * vg) + bg);
*r = (int)(-(v * vr) + y1 + br); *r = (int)(y1 + -(v * vr) + br);
} }
// C reference code that mimics the YUV assembly. // C reference code that mimics the YUV assembly.
@ -1822,11 +1948,11 @@ void I444ToARGBRow_C(const uint8_t* src_y,
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
yuvconstants); yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
yuvconstants); yuvconstants);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_y += 2; src_y += 2;
src_u += 2; src_u += 2;
@ -1834,8 +1960,8 @@ void I444ToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
} }
@ -1848,8 +1974,8 @@ void I444ToARGBRow_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
src_y += 1; src_y += 1;
src_u += 1; src_u += 1;
@ -1868,11 +1994,11 @@ void I422ToARGBRow_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_y += 2; src_y += 2;
src_u += 1; src_u += 1;
@ -1880,8 +2006,8 @@ void I422ToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
} }
@ -1895,11 +2021,11 @@ void I210ToARGBRow_C(const uint16_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_y += 2; src_y += 2;
src_u += 1; src_u += 1;
@ -1907,8 +2033,8 @@ void I210ToARGBRow_C(const uint16_t* src_y,
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
} }
@ -1921,8 +2047,8 @@ void I410ToARGBRow_C(const uint16_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
src_y += 1; src_y += 1;
src_u += 1; src_u += 1;
@ -1940,11 +2066,11 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = clamp255(src_a[0] >> 2); rgb_buf[3] = clamp255(src_a[0] >> 2);
YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = clamp255(src_a[1] >> 2); rgb_buf[7] = clamp255(src_a[1] >> 2);
src_y += 2; src_y += 2;
src_u += 1; src_u += 1;
@ -1953,8 +2079,8 @@ void I210AlphaToARGBRow_C(const uint16_t* src_y,
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = clamp255(src_a[0] >> 2); rgb_buf[3] = clamp255(src_a[0] >> 2);
} }
} }
@ -1968,8 +2094,8 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = clamp255(src_a[0] >> 2); rgb_buf[3] = clamp255(src_a[0] >> 2);
src_y += 1; src_y += 1;
src_u += 1; src_u += 1;
@ -1979,6 +2105,33 @@ void I410AlphaToARGBRow_C(const uint16_t* src_y,
} }
} }
// 12 bit YUV (I212, 4:2:2) to 8 bit ARGB, C reference.
// Each U/V sample is shared by two horizontally adjacent Y samples.
void I212ToARGBRow_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int i;
  // Process pixels in pairs; both pixels reuse the same chroma sample.
  for (i = 0; i + 2 <= width; i += 2) {
    YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf, rgb_buf + 1, rgb_buf + 2,
               yuvconstants);
    rgb_buf[3] = 255;  // Opaque alpha.
    YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
               rgb_buf + 6, yuvconstants);
    rgb_buf[7] = 255;  // Opaque alpha.
    src_y += 2;
    src_u += 1;
    src_v += 1;
    rgb_buf += 8;  // Advance 2 ARGB pixels.
  }
  // Odd trailing pixel, if any.
  if (width & 1) {
    YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf, rgb_buf + 1, rgb_buf + 2,
               yuvconstants);
    rgb_buf[3] = 255;
  }
}
static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
uint32_t ar30; uint32_t ar30;
b = b >> 4; // convert 8 bit 10.6 to 10 bit. b = b >> 4; // convert 8 bit 10.6 to 10 bit.
@ -2018,6 +2171,33 @@ void I210ToAR30Row_C(const uint16_t* src_y,
} }
} }
// 12 bit YUV (I212, 4:2:2) to 10 bit AR30, C reference.
// Each U/V sample is shared by two horizontally adjacent Y samples.
void I212ToAR30Row_C(const uint16_t* src_y,
                     const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint8_t* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int i;
  int b16;
  int g16;
  int r16;
  for (i = 0; i + 2 <= width; i += 2) {
    // First pixel of the pair.
    YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b16, &g16, &r16, yuvconstants);
    StoreAR30(rgb_buf, b16, g16, r16);
    // Second pixel reuses the same U/V sample.
    YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b16, &g16, &r16, yuvconstants);
    StoreAR30(rgb_buf + 4, b16, g16, r16);
    src_y += 2;
    src_u += 1;
    src_v += 1;
    rgb_buf += 8;  // Advance 2 AR30 pixels.
  }
  // Odd trailing pixel, if any.
  if (width & 1) {
    YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b16, &g16, &r16, yuvconstants);
    StoreAR30(rgb_buf, b16, g16, r16);
  }
}
void I410ToAR30Row_C(const uint16_t* src_y, void I410ToAR30Row_C(const uint16_t* src_y,
const uint16_t* src_u, const uint16_t* src_u,
const uint16_t* src_v, const uint16_t* src_v,
@ -2038,6 +2218,7 @@ void I410ToAR30Row_C(const uint16_t* src_y,
} }
} }
// P210 has 10 bits in msb of 16 bit NV12 style layout.
void P210ToARGBRow_C(const uint16_t* src_y, void P210ToARGBRow_C(const uint16_t* src_y,
const uint16_t* src_uv, const uint16_t* src_uv,
uint8_t* rgb_buf, uint8_t* rgb_buf,
@ -2163,11 +2344,11 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y,
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
yuvconstants); yuvconstants);
rgb_buf[3] = src_a[0]; rgb_buf[3] = src_a[0];
YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6,
yuvconstants); yuvconstants);
rgb_buf[7] = src_a[1]; rgb_buf[7] = src_a[1];
src_y += 2; src_y += 2;
src_u += 2; src_u += 2;
@ -2176,8 +2357,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0]; rgb_buf[3] = src_a[0];
} }
} }
@ -2191,8 +2372,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0]; rgb_buf[3] = src_a[0];
src_y += 1; src_y += 1;
src_u += 1; src_u += 1;
@ -2212,11 +2393,11 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0]; rgb_buf[3] = src_a[0];
YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = src_a[1]; rgb_buf[7] = src_a[1];
src_y += 2; src_y += 2;
src_u += 1; src_u += 1;
@ -2225,8 +2406,8 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = src_a[0]; rgb_buf[3] = src_a[0];
} }
} }
@ -2239,18 +2420,18 @@ void I422ToRGB24Row_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
rgb_buf + 5, yuvconstants); rgb_buf + 5, yuvconstants);
src_y += 2; src_y += 2;
src_u += 1; src_u += 1;
src_v += 1; src_v += 1;
rgb_buf += 6; // Advance 2 pixels. rgb_buf += 6; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
} }
} }
@ -2268,8 +2449,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
uint8_t r1; uint8_t r1;
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 4; b0 = b0 >> 4;
g0 = g0 >> 4; g0 = g0 >> 4;
r0 = r0 >> 4; r0 = r0 >> 4;
@ -2284,7 +2465,7 @@ void I422ToARGB4444Row_C(const uint8_t* src_y,
dst_argb4444 += 4; // Advance 2 pixels. dst_argb4444 += 4; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 4; b0 = b0 >> 4;
g0 = g0 >> 4; g0 = g0 >> 4;
r0 = r0 >> 4; r0 = r0 >> 4;
@ -2306,8 +2487,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
uint8_t r1; uint8_t r1;
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 3; g0 = g0 >> 3;
r0 = r0 >> 3; r0 = r0 >> 3;
@ -2322,7 +2503,7 @@ void I422ToARGB1555Row_C(const uint8_t* src_y,
dst_argb1555 += 4; // Advance 2 pixels. dst_argb1555 += 4; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 3; g0 = g0 >> 3;
r0 = r0 >> 3; r0 = r0 >> 3;
@ -2344,8 +2525,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
uint8_t r1; uint8_t r1;
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@ -2360,7 +2541,7 @@ void I422ToRGB565Row_C(const uint8_t* src_y,
dst_rgb565 += 4; // Advance 2 pixels. dst_rgb565 += 4; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@ -2375,19 +2556,19 @@ void NV12ToARGBRow_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_y += 2; src_y += 2;
src_uv += 2; src_uv += 2;
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
} }
@ -2399,19 +2580,19 @@ void NV21ToARGBRow_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_y += 2; src_y += 2;
src_vu += 2; src_vu += 2;
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
} }
@ -2423,17 +2604,17 @@ void NV12ToRGB24Row_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4,
rgb_buf + 5, yuvconstants); rgb_buf + 5, yuvconstants);
src_y += 2; src_y += 2;
src_uv += 2; src_uv += 2;
rgb_buf += 6; // Advance 2 pixels. rgb_buf += 6; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
} }
} }
@ -2444,17 +2625,17 @@ void NV21ToRGB24Row_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4,
rgb_buf + 5, yuvconstants); rgb_buf + 5, yuvconstants);
src_y += 2; src_y += 2;
src_vu += 2; src_vu += 2;
rgb_buf += 6; // Advance 2 pixels. rgb_buf += 6; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
} }
} }
@ -2471,8 +2652,8 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
uint8_t r1; uint8_t r1;
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@ -2486,7 +2667,7 @@ void NV12ToRGB565Row_C(const uint8_t* src_y,
dst_rgb565 += 4; // Advance 2 pixels. dst_rgb565 += 4; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
b0 = b0 >> 3; b0 = b0 >> 3;
g0 = g0 >> 2; g0 = g0 >> 2;
r0 = r0 >> 3; r0 = r0 >> 3;
@ -2500,18 +2681,18 @@ void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel8_8(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_yuy2 += 4; src_yuy2 += 4;
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
} }
@ -2522,18 +2703,18 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
YuvPixel8_8(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
rgb_buf + 6, yuvconstants); rgb_buf + 6, yuvconstants);
rgb_buf[7] = 255; rgb_buf[7] = 255;
src_uyvy += 4; src_uyvy += 4;
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
rgb_buf + 2, yuvconstants); rgb_buf + 2, yuvconstants);
rgb_buf[3] = 255; rgb_buf[3] = 255;
} }
} }
@ -2546,11 +2727,11 @@ void I422ToRGBARow_C(const uint8_t* src_y,
int width) { int width) {
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
rgb_buf + 3, yuvconstants); rgb_buf + 3, yuvconstants);
rgb_buf[0] = 255; rgb_buf[0] = 255;
YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
rgb_buf + 7, yuvconstants); rgb_buf + 7, yuvconstants);
rgb_buf[4] = 255; rgb_buf[4] = 255;
src_y += 2; src_y += 2;
src_u += 1; src_u += 1;
@ -2558,8 +2739,8 @@ void I422ToRGBARow_C(const uint8_t* src_y,
rgb_buf += 8; // Advance 2 pixels. rgb_buf += 8; // Advance 2 pixels.
} }
if (width & 1) { if (width & 1) {
YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
rgb_buf + 3, yuvconstants); rgb_buf + 3, yuvconstants);
rgb_buf[0] = 255; rgb_buf[0] = 255;
} }
} }

View File

@ -2001,6 +2001,19 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
"packuswb %%xmm5,%%xmm5 \n" \ "packuswb %%xmm5,%%xmm5 \n" \
"lea 0x10(%[a_buf]),%[a_buf] \n" "lea 0x10(%[a_buf]),%[a_buf] \n"
// Read 4 UV from 422 12 bit, upsample to 8 UV
// Loads 4 U and 4 V samples (12 bit values held in uint16_t), interleaves
// them (punpcklwd), arithmetic-shifts right by 4 to reduce the chroma to
// 8 bit, packs to bytes (packuswb) and duplicates each UV pair so it covers
// the two Y samples it belongs to. Then loads 8 Y values and shifts them
// left by 4 (12 -> 16 bit) for the fixed point YUVTORGB stage.
// NOTE(review): assumes the 12 bit samples occupy the low 12 bits of each
// uint16_t -- confirm against the I212 format definition.
#define READYUV212 \
  "movq (%[u_buf]),%%xmm0 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm0 \n" \
  "psraw $0x4,%%xmm0 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "punpcklwd %%xmm0,%%xmm0 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $0x4,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \ #define READYUVA422 \
"movd (%[u_buf]),%%xmm0 \n" \ "movd (%[u_buf]),%%xmm0 \n" \
@ -2398,6 +2411,36 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
); );
} }
// 12 bit YUV to ARGB
// SSSE3 row function: converts 8 pixels of 12 bit 4:2:2 YUV (I212) per loop
// iteration to 8 bit ARGB using the coefficients in |yuvconstants|.
// NOTE(review): the loop ("sub $0x8" / "jg") processes width in groups of
// 8 -- confirm callers handle any remainder (e.g. via the ANY wrappers).
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Keep v_buf as an offset from u_buf so one register indexes both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5 = all ones; supplies the 0xff alpha bytes for STOREARGB.
    "pcmpeqb %%xmm5,%%xmm5 \n"

  LABELALIGN
    "1: \n"
    READYUV212
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
// 10 bit YUV to AR30 // 10 bit YUV to AR30
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf, const uint16_t* u_buf,
@ -2433,6 +2476,41 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
); );
} }
// 12 bit YUV to AR30
// SSSE3 row function: converts 8 pixels of 12 bit 4:2:2 YUV (I212) per loop
// iteration to 10 bit AR30 (2 bit alpha + 10 bit RGB) via the 16 bit
// YUVTORGB16 path and STOREAR30.
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Keep v_buf as an offset from u_buf so one register indexes both planes.
    "sub %[u_buf],%[v_buf] \n"
    // xmm5: 0xffff -> 0x0003 -> 0x0030 per word; the alpha field pattern.
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n" // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
    "psrlw $6,%%xmm7 \n" // 1023 for max
  LABELALIGN
    "1: \n"
    READYUV212
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
    [width]"+rm"(width) // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
// 10 bit YUV to ARGB // 10 bit YUV to ARGB
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
const uint16_t* u_buf, const uint16_t* u_buf,
@ -2443,16 +2521,16 @@ void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READYUV410 READYUV410
YUVTORGB(yuvconstants) YUVTORGB(yuvconstants)
STOREARGB STOREARGB
"sub $0x8,%[width] \n" "sub $0x8,%[width] \n"
"jg 1b \n" "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf] [u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
@ -2474,29 +2552,26 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(
"sub %[u_buf],%[v_buf] \n" yuvconstants) "sub %[u_buf],%[v_buf] \n"
LABELALIGN LABELALIGN "1: \n" READYUVA210
"1: \n" YUVTORGB(yuvconstants) STOREARGB
READYUVA210 "subl $0x8,%[width] \n"
YUVTORGB(yuvconstants) "jg 1b \n"
STOREARGB : [y_buf] "+r"(y_buf), // %[y_buf]
"subl $0x8,%[width] \n" [u_buf] "+r"(u_buf), // %[u_buf]
"jg 1b \n" [v_buf] "+r"(v_buf), // %[v_buf]
: [y_buf] "+r"(y_buf), // %[y_buf] [a_buf] "+r"(a_buf),
[u_buf] "+r"(u_buf), // %[u_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb]
[v_buf] "+r"(v_buf), // %[v_buf]
[a_buf] "+r"(a_buf),
[dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__) #if defined(__i386__)
[width]"+m"(width) // %[width] [width] "+m"(width) // %[width]
#else #else
[width]"+rm"(width) // %[width] [width] "+rm"(width) // %[width]
#endif #endif
: [yuvconstants] "r"(yuvconstants) // %[yuvconstants] : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
); "xmm5");
} }
#endif #endif
@ -2511,29 +2586,26 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(
"sub %[u_buf],%[v_buf] \n" yuvconstants) "sub %[u_buf],%[v_buf] \n"
LABELALIGN LABELALIGN "1: \n" READYUVA410
"1: \n" YUVTORGB(yuvconstants) STOREARGB
READYUVA410 "subl $0x8,%[width] \n"
YUVTORGB(yuvconstants) "jg 1b \n"
STOREARGB : [y_buf] "+r"(y_buf), // %[y_buf]
"subl $0x8,%[width] \n" [u_buf] "+r"(u_buf), // %[u_buf]
"jg 1b \n" [v_buf] "+r"(v_buf), // %[v_buf]
: [y_buf] "+r"(y_buf), // %[y_buf] [a_buf] "+r"(a_buf),
[u_buf] "+r"(u_buf), // %[u_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb]
[v_buf] "+r"(v_buf), // %[v_buf]
[a_buf] "+r"(a_buf),
[dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__) #if defined(__i386__)
[width]"+m"(width) // %[width] [width] "+m"(width) // %[width]
#else #else
[width]"+rm"(width) // %[width] [width] "+rm"(width) // %[width]
#endif #endif
: [yuvconstants] "r"(yuvconstants) // %[yuvconstants] : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
); "xmm5");
} }
#endif #endif
@ -2547,21 +2619,21 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(yuvconstants)
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n" "psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits "psllw $4,%%xmm5 \n" // 2 alpha bits
"pxor %%xmm6,%%xmm6 \n" "pxor %%xmm6,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
"psrlw $6,%%xmm7 \n" // 1023 for max "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READYUV410 READYUV410
YUVTORGB16(yuvconstants) YUVTORGB16(yuvconstants)
STOREAR30 STOREAR30
"sub $0x8,%[width] \n" "sub $0x8,%[width] \n"
"jg 1b \n" "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf] [u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
@ -2729,26 +2801,22 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile ( asm volatile(
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(
"pcmpeqb %%xmm5,%%xmm5 \n" yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN "1: \n" READP210
"1: \n" YUVTORGB(yuvconstants) STOREARGB
READP210 "sub $0x8,%[width] \n"
YUVTORGB(yuvconstants) "jg 1b \n"
STOREARGB : [y_buf] "+r"(y_buf), // %[y_buf]
"sub $0x8,%[width] \n" [uv_buf] "+r"(uv_buf), // %[u_buf]
"jg 1b \n" [dst_argb] "+r"(dst_argb), // %[dst_argb]
: [y_buf] "+r"(y_buf), // %[y_buf] [width] "+rm"(width) // %[width]
[uv_buf] "+r"(uv_buf), // %[u_buf] : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
[dst_argb] "+r"(dst_argb), // %[dst_argb] : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
[width] "+rm"(width) // %[width] "xmm5");
: [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
} }
void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
@ -2756,25 +2824,22 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm volatile ( asm volatile(
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(
"pcmpeqb %%xmm5,%%xmm5 \n" yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN "1: \n" READP410
"1: \n" YUVTORGB(yuvconstants) STOREARGB
READP410 "sub $0x8,%[width] \n"
YUVTORGB(yuvconstants) "jg 1b \n"
STOREARGB : [y_buf] "+r"(y_buf), // %[y_buf]
"sub $0x8,%[width] \n" [uv_buf] "+r"(uv_buf), // %[u_buf]
"jg 1b \n" [dst_argb] "+r"(dst_argb), // %[dst_argb]
: [y_buf] "+r"(y_buf), // %[y_buf] [width] "+rm"(width) // %[width]
[uv_buf] "+r"(uv_buf), // %[u_buf] : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
[dst_argb] "+r"(dst_argb), // %[dst_argb] : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
[width] "+rm"(width) // %[width] "xmm5");
: [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
"xmm5");
} }
void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
@ -2785,20 +2850,20 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n" "psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits "psllw $4,%%xmm5 \n" // 2 alpha bits
"pxor %%xmm6,%%xmm6 \n" "pxor %%xmm6,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
"psrlw $6,%%xmm7 \n" // 1023 for max "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READP210 READP210
YUVTORGB16(yuvconstants) YUVTORGB16(yuvconstants)
STOREAR30 STOREAR30
"sub $0x8,%[width] \n" "sub $0x8,%[width] \n"
"jg 1b \n" "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@ -2817,20 +2882,20 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP(yuvconstants) YUVTORGB_SETUP(yuvconstants)
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n" "psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits "psllw $4,%%xmm5 \n" // 2 alpha bits
"pxor %%xmm6,%%xmm6 \n" "pxor %%xmm6,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min
"psrlw $6,%%xmm7 \n" // 1023 for max "psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READP410 READP410
YUVTORGB16(yuvconstants) YUVTORGB16(yuvconstants)
STOREAR30 STOREAR30
"sub $0x8,%[width] \n" "sub $0x8,%[width] \n"
"jg 1b \n" "jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@ -2948,6 +3013,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
"vpsllw $6,%%ymm4,%%ymm4 \n" \ "vpsllw $6,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n" "lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 8 UV from 212 12 bit, upsample to 16 UV
#define READYUV212_AVX2 \
"vmovdqu (%[u_buf]),%%xmm0 \n" \
"vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
"lea 0x10(%[u_buf]),%[u_buf] \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \
"vpsraw $0x4,%%ymm0,%%ymm0 \n" \
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
"vmovdqu (%[y_buf]),%%ymm4 \n" \
"vpsllw $0x4,%%ymm4,%%ymm4 \n" \
"lea 0x20(%[y_buf]),%[y_buf] \n"
// Read 16 UV from 410. With 16 Alpha. // Read 16 UV from 410. With 16 Alpha.
#define READYUVA410_AVX2 \ #define READYUVA410_AVX2 \
"vmovdqu (%[u_buf]),%%ymm0 \n" \ "vmovdqu (%[u_buf]),%%ymm0 \n" \
@ -3295,6 +3375,41 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
} }
#endif // HAS_I210TOARGBROW_AVX2 #endif // HAS_I210TOARGBROW_AVX2
#if defined(HAS_I212TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
READYUV212_AVX2
YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I212TOARGBROW_AVX2
#if defined(HAS_I210TOAR30ROW_AVX2) #if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels // 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
@ -3335,6 +3450,46 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
} }
#endif // HAS_I210TOAR30ROW_AVX2 #endif // HAS_I210TOAR30ROW_AVX2
#if defined(HAS_I212TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
const uint16_t* u_buf,
const uint16_t* v_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
READYUV212_AVX2
YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2
"sub $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_I212TOAR30ROW_AVX2
#if defined(HAS_I410TOARGBROW_AVX2) #if defined(HAS_I410TOARGBROW_AVX2)
// 16 pixels // 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes). // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
@ -3347,17 +3502,17 @@ void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READYUV410_AVX2 READYUV410_AVX2
YUVTORGB_AVX2(yuvconstants) YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2 STOREARGB_AVX2
"sub $0x10,%[width] \n" "sub $0x10,%[width] \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf] [u_buf]"+r"(u_buf), // %[u_buf]
@ -3383,32 +3538,28 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(
"sub %[u_buf],%[v_buf] \n" yuvconstants) "sub %[u_buf],%[v_buf] \n"
LABELALIGN LABELALIGN "1: \n" READYUVA210_AVX2
"1: \n" YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
READYUVA210_AVX2 "subl $0x10,%[width] \n"
YUVTORGB_AVX2(yuvconstants) "jg 1b \n"
STOREARGB_AVX2 "vzeroupper \n"
"subl $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf] "+r"(y_buf), // %[y_buf] : [y_buf] "+r"(y_buf), // %[y_buf]
[u_buf] "+r"(u_buf), // %[u_buf] [u_buf] "+r"(u_buf), // %[u_buf]
[v_buf] "+r"(v_buf), // %[v_buf] [v_buf] "+r"(v_buf), // %[v_buf]
[a_buf] "+r"(a_buf), // %[a_buf] [a_buf] "+r"(a_buf), // %[a_buf]
[dst_argb] "+r"(dst_argb), // %[dst_argb] [dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__) #if defined(__i386__)
[width]"+m"(width) // %[width] [width] "+m"(width) // %[width]
#else #else
[width]"+rm"(width) // %[width] [width] "+rm"(width) // %[width]
#endif #endif
: [yuvconstants] "r"(yuvconstants) // %[yuvconstants] : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5" "xmm4", "xmm5");
);
} }
#endif // HAS_I210TOARGBROW_AVX2 #endif // HAS_I210TOARGBROW_AVX2
@ -3424,32 +3575,28 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf,
int width) { int width) {
asm volatile( asm volatile(
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(
"sub %[u_buf],%[v_buf] \n" yuvconstants) "sub %[u_buf],%[v_buf] \n"
LABELALIGN LABELALIGN "1: \n" READYUVA410_AVX2
"1: \n" YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2
READYUVA410_AVX2 "subl $0x10,%[width] \n"
YUVTORGB_AVX2(yuvconstants) "jg 1b \n"
STOREARGB_AVX2 "vzeroupper \n"
"subl $0x10,%[width] \n"
"jg 1b \n"
"vzeroupper \n"
: [y_buf] "+r"(y_buf), // %[y_buf] : [y_buf] "+r"(y_buf), // %[y_buf]
[u_buf] "+r"(u_buf), // %[u_buf] [u_buf] "+r"(u_buf), // %[u_buf]
[v_buf] "+r"(v_buf), // %[v_buf] [v_buf] "+r"(v_buf), // %[v_buf]
[a_buf] "+r"(a_buf), // %[a_buf] [a_buf] "+r"(a_buf), // %[a_buf]
[dst_argb] "+r"(dst_argb), // %[dst_argb] [dst_argb] "+r"(dst_argb), // %[dst_argb]
#if defined(__i386__) #if defined(__i386__)
[width]"+m"(width) // %[width] [width] "+m"(width) // %[width]
#else #else
[width]"+rm"(width) // %[width] [width] "+rm"(width) // %[width]
#endif #endif
: [yuvconstants] "r"(yuvconstants) // %[yuvconstants] : [yuvconstants] "r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3",
"xmm4", "xmm5" "xmm4", "xmm5");
);
} }
#endif // HAS_I410TOARGBROW_AVX2 #endif // HAS_I410TOARGBROW_AVX2
@ -3465,23 +3612,23 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(yuvconstants)
"sub %[u_buf],%[v_buf] \n" "sub %[u_buf],%[v_buf] \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n" "vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n" "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READYUV410_AVX2 READYUV410_AVX2
YUVTORGB16_AVX2(yuvconstants) YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2 STOREAR30_AVX2
"sub $0x10,%[width] \n" "sub $0x10,%[width] \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf] [u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf] [v_buf]"+r"(v_buf), // %[v_buf]
@ -3764,16 +3911,16 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf,
// clang-format off // clang-format off
asm volatile ( asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READP210_AVX2 READP210_AVX2
YUVTORGB_AVX2(yuvconstants) YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2 STOREARGB_AVX2
"sub $0x10,%[width] \n" "sub $0x10,%[width] \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
@ -3797,16 +3944,16 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf,
// clang-format off // clang-format off
asm volatile ( asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READP410_AVX2 READP410_AVX2
YUVTORGB_AVX2(yuvconstants) YUVTORGB_AVX2(yuvconstants)
STOREARGB_AVX2 STOREARGB_AVX2
"sub $0x10,%[width] \n" "sub $0x10,%[width] \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb] [dst_argb]"+r"(dst_argb), // %[dst_argb]
@ -3830,22 +3977,22 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n" "vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n" "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READP210_AVX2 READP210_AVX2
YUVTORGB16_AVX2(yuvconstants) YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2 STOREAR30_AVX2
"sub $0x10,%[width] \n" "sub $0x10,%[width] \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@ -3868,22 +4015,22 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
asm volatile ( asm volatile (
YUVTORGB_SETUP_AVX2(yuvconstants) YUVTORGB_SETUP_AVX2(yuvconstants)
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n" "vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n" "vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
READP410_AVX2 READP410_AVX2
YUVTORGB16_AVX2(yuvconstants) YUVTORGB16_AVX2(yuvconstants)
STOREAR30_AVX2 STOREAR30_AVX2
"sub $0x10,%[width] \n" "sub $0x10,%[width] \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf] : [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
@ -4409,33 +4556,33 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
depth = 16 - depth; depth = 16 - depth;
// clang-format off // clang-format off
asm volatile ( asm volatile (
"vmovd %4,%%xmm3 \n" "vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%xmm3 \n" "vbroadcastss %%xmm3,%%xmm3 \n"
"vbroadcastf128 %5,%%ymm4 \n" "vbroadcastf128 %5,%%ymm4 \n"
"sub %1,%2 \n" "sub %1,%2 \n"
// 16 pixels per loop. // 16 pixels per loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x20(%0),%%ymm1 \n"
"add $0x40,%0 \n" "add $0x40,%0 \n"
"vpsrlw %%xmm3,%%ymm0,%%ymm0 \n" "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
"vpsrlw %%xmm3,%%ymm1,%%ymm1 \n" "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm4,%%ymm1,%%ymm1 \n" "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n" "vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x0,%%ymm1,0x10(%1) \n" "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
"vextractf128 $0x1,%%ymm0,(%1,%2) \n" "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
"vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n" "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
"add $0x20,%1 \n" "add $0x20,%1 \n"
"sub $0x10,%3 \n" "sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_uv), // %0 : "+r"(src_uv), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
@ -4499,24 +4646,24 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
int width) { int width) {
// clang-format off // clang-format off
asm volatile ( asm volatile (
"vmovd %3,%%xmm3 \n" "vmovd %3,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n"
"sub %0,%1 \n" "sub %0,%1 \n"
// 32 pixels per loop. // 32 pixels per loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x20(%0),%%ymm1 \n"
"vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm0,(%0,%1) \n" "vmovdqu %%ymm0,(%0,%1) \n"
"vmovdqu %%ymm1,0x20(%0,%1) \n" "vmovdqu %%ymm1,0x20(%0,%1) \n"
"add $0x40,%0 \n" "add $0x40,%0 \n"
"sub $0x20,%2 \n" "sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width), // %2 "+r"(width), // %2
@ -5173,7 +5320,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
#if defined(__i386__) #if defined(__i386__)
"+m"(width) // %5 "+m"(width) // %5
#else #else
"+rm"(width) // %5 "+rm"(width) // %5
#endif #endif
: "m"(kShuffleMaskARGBSplit) // %6 : "m"(kShuffleMaskARGBSplit) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
@ -5264,7 +5411,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
#if defined(__i386__) #if defined(__i386__)
"+m"(width) // %5 "+m"(width) // %5
#else #else
"+rm"(width) // %5 "+rm"(width) // %5
#endif #endif
: "m"(kShuffleMaskARGBSplit), // %6 : "m"(kShuffleMaskARGBSplit), // %6
"m"(kShuffleMaskARGBPermute) // %7 "m"(kShuffleMaskARGBPermute) // %7
@ -7981,7 +8128,7 @@ void HalfFloatRow_AVX2(const uint16_t* src,
#if defined(__x86_64__) #if defined(__x86_64__)
: "x"(scale) // %3 : "x"(scale) // %3
#else #else
: "m"(scale) // %3 : "m"(scale) // %3
#endif #endif
: "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
} }
@ -8019,7 +8166,7 @@ void HalfFloatRow_F16C(const uint16_t* src,
#if defined(__x86_64__) #if defined(__x86_64__)
: "x"(scale) // %3 : "x"(scale) // %3
#else #else
: "m"(scale) // %3 : "m"(scale) // %3
#endif #endif
: "memory", "cc", "xmm2", "xmm3", "xmm4"); : "memory", "cc", "xmm2", "xmm3", "xmm4");
} }

View File

@ -1619,7 +1619,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
uint16_t* dst_ptr) { uint16_t* dst_ptr) {
void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride, void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleRowUp2_Bilinear_16_Any_C; ScaleRowUp2_Bilinear_16_Any_C;
int x; int x;
// This function can only scale up by 2 times. // This function can only scale up by 2 times.

View File

@ -3167,67 +3167,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
} }
#endif // HAS_ABGRTOAR30ROW_AVX2 #endif // HAS_ABGRTOAR30ROW_AVX2
// TODO(fbarchard): Fix clamping issue affected by U channel. // Provide matrix wrappers for 12 bit YUV
#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ #define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \
ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ I012ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ #define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
const int kBpc = 2; \
align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = \
(fastrand() & ((1 << S_DEPTH) - 1)); \
} \
for (int i = 0; i < kSizeUV; ++i) { \
reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = \
(fastrand() & ((1 << S_DEPTH) - 1)); \
reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = \
(fastrand() & ((1 << S_DEPTH) - 1)); \
} \
memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_PLANAR##To##FMT_B( \
reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FMT_PLANAR##To##FMT_B( \
reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
} \
for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
} \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
}
#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, S_DEPTH) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_ - 4, _Any, +, 0, 0, S_DEPTH) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, _Unaligned, +, 1, 1, S_DEPTH) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH)
#define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ #define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \
I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j)
@ -3254,43 +3198,105 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
#define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \ #define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \
I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j)
TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 10) // TODO(fbarchard): Fix clamping issue affected by U channel.
TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 10) #define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 10) BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \
TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 10) TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1, 10) const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1, 10) const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1, 10) const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1, 10) const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1, 10) const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1, 10) const int kBpc = 2; \
TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1, 10) align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \
TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1, 10) align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \
TESTPLANAR16TOB(I410, 1, 1, ARGB, 4, 4, 1, 10) align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \
TESTPLANAR16TOB(I410, 1, 1, ABGR, 4, 4, 1, 10) align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \
TESTPLANAR16TOB(H410, 1, 1, ARGB, 4, 4, 1, 10) align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \
TESTPLANAR16TOB(H410, 1, 1, ABGR, 4, 4, 1, 10) for (int i = 0; i < kWidth * kHeight; ++i) { \
TESTPLANAR16TOB(U410, 1, 1, ARGB, 4, 4, 1, 10) reinterpret_cast<uint16_t*>(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \
TESTPLANAR16TOB(U410, 1, 1, ABGR, 4, 4, 1, 10) } \
for (int i = 0; i < kSizeUV; ++i) { \
reinterpret_cast<uint16_t*>(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \
reinterpret_cast<uint16_t*>(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \
} \
memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \
memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_PLANAR##To##FMT_B( \
reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FMT_PLANAR##To##FMT_B( \
reinterpret_cast<uint16_t*>(src_y + SOFF), kWidth, \
reinterpret_cast<uint16_t*>(src_u + SOFF), kStrideUV, \
reinterpret_cast<uint16_t*>(src_v + SOFF), kStrideUV, \
dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \
} \
for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \
} \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_u); \
free_aligned_buffer_page_end(src_v); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
}
#define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \
BPP_B, ALIGN, YALIGN) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
ALIGN, YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \
TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \
ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0)
TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1)
TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1)
TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1)
#ifdef LITTLE_ENDIAN_ONLY_TEST #ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(I410, 1, 1, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(I410, 1, 1, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(H410, 1, 1, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(H410, 1, 1, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(U410, 1, 1, AR30, 4, 4, 1, 10) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1)
TESTPLANAR16TOB(U410, 1, 1, AB30, 4, 4, 1, 10) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1)
TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1)
#endif #endif
#define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ #define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \

View File

@ -302,7 +302,7 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO1(name, width, height, Bilinear, 3) TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO(ARGBScale, 1, 1) TEST_SCALETO(ARGBScale, 1, 1)
TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360) TEST_SCALETO(ARGBScale, 640, 360)

View File

@ -1025,7 +1025,7 @@ TEST_FACTOR(3, 1, 3, 0)
#endif #endif
TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 1, 1)
TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360) TEST_SCALETO(Scale, 640, 360)

View File

@ -166,7 +166,7 @@ TEST_FACTOR(3, 1, 3)
TEST_SCALETO1(name, width, height, Bilinear, 3) TEST_SCALETO1(name, width, height, Bilinear, 3)
TEST_SCALETO(UVScale, 1, 1) TEST_SCALETO(UVScale, 1, 1)
TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(UVScale, 320, 240) TEST_SCALETO(UVScale, 320, 240)
TEST_SCALETO(UVScale, 569, 480) TEST_SCALETO(UVScale, 569, 480)
TEST_SCALETO(UVScale, 640, 360) TEST_SCALETO(UVScale, 640, 360)

View File

@ -26,9 +26,13 @@ unsigned int fastrand_seed = 0xfb;
ABSL_FLAG(int32_t, libyuv_width, 0, "width of test image."); ABSL_FLAG(int32_t, libyuv_width, 0, "width of test image.");
ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image."); ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image.");
ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test."); ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test.");
ABSL_FLAG(int32_t, libyuv_flags, 0, ABSL_FLAG(int32_t,
libyuv_flags,
0,
"cpu flags for reference code. 1 = C, -1 = SIMD"); "cpu flags for reference code. 1 = C, -1 = SIMD");
ABSL_FLAG(int32_t, libyuv_cpu_info, 0, ABSL_FLAG(int32_t,
libyuv_cpu_info,
0,
"cpu flags for benchmark code. 1 = C, -1 = SIMD"); "cpu flags for benchmark code. 1 = C, -1 = SIMD");
#else #else
// Disable command line parameters if absl/flags disabled. // Disable command line parameters if absl/flags disabled.