Optimize unlimited data for Intel

Use unsigned coefficient and signed UV value in YUVTORGB.

R=fbarchard@chromium.org

Bug: libyuv:862, libyuv:863
Change-Id: I32e58b2cee383fb98104c055beb0867a7ad05bfe
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2850016
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Author: Yuan Tong 2021-04-27 22:47:36 +08:00 (committed by Frank Barchard)
parent 5e05f26a2b
commit c9843de02a
5 changed files with 300 additions and 335 deletions
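
For context, a minimal C sketch (not part of this patch) of the per-pixel math that the new CALC_RGB16 macro below implements: Y is scaled by an unsigned coefficient, U and V are re-centered to signed values and multiplied by unsigned coefficients, and the rounding/range bias is folded into the Y term. The BT.601 YG/YB constants appear in the diff; the UB/UG/VG/VR values and the helper names are illustrative approximations, not the exact table entries.

  #include <stdint.h>

  // Illustrative helper: clamp to [0, 255].
  static int Clamp0To255(int v) {
    return v < 0 ? 0 : (v > 255 ? 255 : v);
  }

  // One BT.601 limited-range pixel, mirroring the new CALC_RGB16 path.
  static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                             uint8_t* b, uint8_t* g, uint8_t* r) {
    const int yg = 18997;  // round(1.164 * 64 * 256 * 256 / 257)
    const int yb = -1160;  // 1.164 * 64 * -16 + 64 / 2
    const int ub = 129, ug = 25, vg = 52, vr = 102;  // approx. round(c * 64)
    int y1 = (int)((uint32_t)(y * 0x0101 * yg) >> 16) + yb;
    int ui = (int)u - 128;  // signed UV value
    int vi = (int)v - 128;
    *b = Clamp0To255((y1 + ui * ub) >> 6);
    *g = Clamp0To255((y1 - (ui * ug + vi * vg)) >> 6);
    *r = Clamp0To255((y1 + vi * vr) >> 6);
  }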


@ -730,25 +730,16 @@ struct YuvConstants {
uint8_t kUVToB[32];
uint8_t kUVToG[32];
uint8_t kUVToR[32];
int16_t kUVBiasB[16];
int16_t kUVBiasG[16];
int16_t kUVBiasR[16];
int16_t kYToRgb[16];
int16_t kYBiasToRgb[16];
uint8_t kUVMaskBR[32];
};
// Offsets into YuvConstants structure
#define KUVTOB 0
#define KUVTOG 32
#define KUVTOR 64
#define KUVBIASB 96
#define KUVBIASG 128
#define KUVBIASR 160
#define KYTORGB 192
#define KYBIASTORGB 224
#define KUMASKB 256
#define KVMASKR 272
#define KYTORGB 96
#define KYBIASTORGB 128
#endif


@ -55,8 +55,8 @@ static __inline int32_t clamp1023(int32_t v) {
return (-(v >= 1023) | v) & 1023;
}
// clamp to 2^n - 1
static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
// clamp to max
static __inline int32_t ClampMax(int32_t v, int32_t max) {
return (-(v >= max) | v) & max;
}
@ -77,7 +77,7 @@ static __inline int32_t clamp1023(int32_t v) {
return (v > 1023) ? 1023 : v;
}
static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
static __inline int32_t ClampMax(int32_t v, int32_t max) {
return (v > max) ? max : v;
}
@ -1422,46 +1422,37 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// clang-format off
#if defined(__aarch64__) || defined(__arm__)
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
// Bias values to round, and subtract 128 from U and V.
// For B and R this is negative. For G this is positive.
#define BB (UB * 128 - YB)
#define BG (UG * 128 + VG * 128 + YB)
#define BR (VR * 128 - YB)
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
{{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, \
{YG, BB, BG, BR, YB, 0, 0, 0}}
#else
#define UVMASK(C) ((C) > 127 ? 0xff : 0)
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR) \
#define YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR) \
{{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
{UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
{0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}, \
{0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), 0, UVMASK(UB), \
0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), \
0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR), 0, UVMASK(VR)}}
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
#endif
// clang-format on
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR, BB, BG, BR); \
YUBCONSTANTSBODY(YG, YB, UB, UG, VG, VR); \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB, BR, BG, BB);
YUBCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
// TODO(fbarchard): Generate SIMD structures from float matrix.
// Bias values to round, and subtract 128 from U and V.
// For B and R this is negative. For G this is positive.
#define BB (UB * 128 - YB)
#define BG (UG * 128 + VG * 128 + YB)
#define BR (VR * 128 - YB)
// BT.601 limited range YUV to RGB reference
// R = (Y - 16) * 1.164 + V * 1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
@ -1482,7 +1473,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1507,7 +1498,7 @@ MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1536,7 +1527,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1561,7 +1552,7 @@ MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1590,7 +1581,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1614,7 +1605,7 @@ MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)
#undef YG
#undef YB
@ -1631,24 +1622,38 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#if defined(__aarch64__) || defined(__arm__)
#define LOAD_YUV_CONSTANTS \
int ub = -yuvconstants->kUVCoeff[0]; \
int vr = -yuvconstants->kUVCoeff[1]; \
int ub = yuvconstants->kUVCoeff[0]; \
int vr = yuvconstants->kUVCoeff[1]; \
int ug = yuvconstants->kUVCoeff[2]; \
int vg = yuvconstants->kUVCoeff[3]; \
int yg = yuvconstants->kRGBCoeffBias[0]; \
int bb = -yuvconstants->kRGBCoeffBias[1]; \
int bb = yuvconstants->kRGBCoeffBias[1]; \
int bg = yuvconstants->kRGBCoeffBias[2]; \
int br = -yuvconstants->kRGBCoeffBias[3]
int br = yuvconstants->kRGBCoeffBias[3]
#define CALC_RGB16 \
int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
int b16 = y1 + (u * ub) - bb; \
int g16 = y1 + bg - (u * ug + v * vg); \
int r16 = y1 + (v * vr) - br
#else
#define LOAD_YUV_CONSTANTS \
int ub = -yuvconstants->kUVToB[0]; \
int ub = yuvconstants->kUVToB[0]; \
int ug = yuvconstants->kUVToG[0]; \
int vg = yuvconstants->kUVToG[1]; \
int vr = -yuvconstants->kUVToR[1]; \
int bb = -yuvconstants->kUVBiasB[0]; \
int bg = yuvconstants->kUVBiasG[0]; \
int br = -yuvconstants->kUVBiasR[0]; \
int yg = yuvconstants->kYToRgb[0]
int vr = yuvconstants->kUVToR[1]; \
int yg = yuvconstants->kYToRgb[0]; \
int yb = yuvconstants->kYBiasToRgb[0]
#define CALC_RGB16 \
int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
int8_t ui = u; \
int8_t vi = v; \
ui -= 0x80; \
vi -= 0x80; \
int b16 = y1 + (ui * ub); \
int g16 = y1 - (ui * ug + vi * vg); \
int r16 = y1 + (vi * vr)
#endif
// C reference code that mimics the YUV assembly.
@ -1661,11 +1666,11 @@ static __inline void YuvPixel(uint8_t y,
uint8_t* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
*b = Clamp((int32_t)(y1 - (u * ub) + bb) >> 6);
*g = Clamp((int32_t)(y1 - (u * ug + v * vg) + bg) >> 6);
*r = Clamp((int32_t)(y1 - (v * vr) + br) >> 6);
uint32_t y32 = y * 0x0101;
CALC_RGB16;
*b = Clamp((int32_t)(b16) >> 6);
*g = Clamp((int32_t)(g16) >> 6);
*r = Clamp((int32_t)(r16) >> 6);
}
// Reads 8 bit YUV and leaves result as 16 bit.
@ -1677,11 +1682,11 @@ static __inline void YuvPixel8_16(uint8_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
*b = (int)(y1 - (u * ub) + bb);
*g = (int)(y1 - (u * ug + v * vg) + bg);
*r = (int)(y1 - (v * vr) + br);
uint32_t y32 = y * 0x0101;
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV 16 bit assembly.
@ -1694,13 +1699,13 @@ static __inline void YuvPixel10_16(uint16_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
uint32_t y32 = y << 6;
u = clamp255(u >> 2);
v = clamp255(v >> 2);
*b = (int)(-(u * ub) + y1 + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg);
*r = (int)(-(v * vr) + y1 + br);
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV 16 bit assembly.
@ -1713,13 +1718,13 @@ static __inline void YuvPixel12_16(int16_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)((y << 4) * yg) >> 16;
uint32_t y32 = y << 4;
u = clamp255(u >> 4);
v = clamp255(v >> 4);
*b = (int)(-(u * ub) + y1 + bb);
*g = (int)(-(u * ug + v * vg) + y1 + bg);
*r = (int)(-(v * vr) + y1 + br);
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV 10 bit assembly.
@ -1768,13 +1773,13 @@ static __inline void YuvPixel16_8(uint16_t y,
uint8_t* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * yg) >> 16;
uint32_t y32 = y;
u = clamp255(u >> 8);
v = clamp255(v >> 8);
*b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6);
*g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6);
*r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6);
CALC_RGB16;
*b = Clamp((int32_t)(b16) >> 6);
*g = Clamp((int32_t)(g16) >> 6);
*r = Clamp((int32_t)(r16) >> 6);
}
// C reference code that mimics the YUV 16 bit assembly.
@ -1787,13 +1792,13 @@ static __inline void YuvPixel16_16(uint16_t y,
int* r,
const struct YuvConstants* yuvconstants) {
LOAD_YUV_CONSTANTS;
uint32_t y1 = (uint32_t)(y * yg) >> 16;
uint32_t y32 = y;
u = clamp255(u >> 8);
v = clamp255(v >> 8);
*b = (int)(y1 + -(u * ub) + bb);
*g = (int)(y1 + -(u * ug + v * vg) + bg);
*r = (int)(y1 + -(v * vr) + br);
CALC_RGB16;
*b = b16;
*g = g16;
*r = r16;
}
// C reference code that mimics the YUV assembly.
@ -2779,10 +2784,10 @@ void MergeAR64Row_C(const uint16_t* src_r,
int shift = 16 - depth;
int max = (1 << depth) - 1;
for (x = 0; x < width; ++x) {
dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
dst_ar64[3] = clamp2nm1(src_a[x], max) << shift;
dst_ar64[0] = ClampMax(src_b[x], max) << shift;
dst_ar64[1] = ClampMax(src_g[x], max) << shift;
dst_ar64[2] = ClampMax(src_r[x], max) << shift;
dst_ar64[3] = ClampMax(src_a[x], max) << shift;
dst_ar64 += 4;
}
}
@ -2819,9 +2824,9 @@ void MergeXR64Row_C(const uint16_t* src_r,
int shift = 16 - depth;
int max = (1 << depth) - 1;
for (x = 0; x < width; ++x) {
dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
dst_ar64[0] = ClampMax(src_b[x], max) << shift;
dst_ar64[1] = ClampMax(src_g[x], max) << shift;
dst_ar64[2] = ClampMax(src_r[x], max) << shift;
dst_ar64[3] = 0xffff;
dst_ar64 += 4;
}


@ -2312,78 +2312,65 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants) \
"pcmpeqb %%xmm13,%%xmm13 \n" \
"movdqa (%[yuvconstants]),%%xmm8 \n" \
"pxor %%xmm12,%%xmm12 \n" \
"movdqa 32(%[yuvconstants]),%%xmm9 \n" \
"psllw $7,%%xmm13 \n" \
"movdqa 64(%[yuvconstants]),%%xmm10 \n" \
"pshufb %%xmm12,%%xmm13 \n" \
"movdqa 96(%[yuvconstants]),%%xmm11 \n" \
"movdqa 128(%[yuvconstants]),%%xmm12 \n" \
"movdqa 160(%[yuvconstants]),%%xmm13 \n" \
"movdqa 192(%[yuvconstants]),%%xmm14 \n" \
"movdqa 256(%[yuvconstants]),%%xmm15 \n" \
"movdqa 272(%[yuvconstants]),%%xmm7 \n"
"movdqa 128(%[yuvconstants]),%%xmm12 \n"
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
"movdqa %%xmm3,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"movdqa %%xmm3,%%xmm2 \n" \
"pmaddubsw %%xmm8,%%xmm0 \n" \
"pmaddubsw %%xmm10,%%xmm2 \n" \
"psllw $8,%%xmm1 \n" \
"pand %%xmm15,%%xmm1 \n" \
"paddw %%xmm1,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"pmaddubsw %%xmm9,%%xmm1 \n" \
"pmulhuw %%xmm14,%%xmm4 \n" \
"pand %%xmm7,%%xmm3 \n" \
"paddw %%xmm3,%%xmm2 \n" \
"paddw %%xmm4,%%xmm0 \n" \
"paddw %%xmm4,%%xmm2 \n" \
"psubb %%xmm13,%%xmm3 \n" \
"pmulhuw %%xmm11,%%xmm4 \n" \
"movdqa %%xmm8,%%xmm0 \n" \
"movdqa %%xmm9,%%xmm1 \n" \
"movdqa %%xmm10,%%xmm2 \n" \
"paddw %%xmm12,%%xmm4 \n" \
"psubusw %%xmm11,%%xmm0 \n" \
"psubusw %%xmm1,%%xmm4 \n" \
"psubusw %%xmm13,%%xmm2 \n" \
"pmaddubsw %%xmm3,%%xmm0 \n" \
"pmaddubsw %%xmm3,%%xmm1 \n" \
"pmaddubsw %%xmm3,%%xmm2 \n" \
"paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm2 \n" \
"psubsw %%xmm1,%%xmm4 \n" \
"movdqa %%xmm4,%%xmm1 \n"
#define YUVTORGB_REGS \
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB16(yuvconstants) \
"movdqa %%xmm3,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"movdqa %%xmm3,%%xmm2 \n" \
"pmaddubsw (%[yuvconstants]),%%xmm0 \n" \
"pmaddubsw 64(%[yuvconstants]),%%xmm2 \n" \
"psllw $8,%%xmm1 \n" \
"pand 256(%[yuvconstants]),%%xmm1 \n" \
"paddw %%xmm1,%%xmm0 \n" \
"movdqa %%xmm3,%%xmm1 \n" \
"pmaddubsw 32(%[yuvconstants]),%%xmm1 \n" \
"pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \
"pand 272(%[yuvconstants]),%%xmm3 \n" \
"paddw %%xmm3,%%xmm2 \n" \
"movdqa 128(%[yuvconstants]),%%xmm7 \n" \
"paddw %%xmm4,%%xmm0 \n" \
"paddw %%xmm4,%%xmm2 \n" \
"paddw %%xmm7,%%xmm4 \n" \
"movdqa 96(%[yuvconstants]),%%xmm7 \n" \
"psubusw %%xmm7,%%xmm0 \n" \
"psubusw %%xmm1,%%xmm4 \n" \
"movdqa 160(%[yuvconstants]),%%xmm7 \n" \
"psubusw %%xmm7,%%xmm2 \n" \
"movdqa %%xmm4,%%xmm1 \n" \
"pcmpeqb %%xmm0,%%xmm0 \n" \
"pxor %%xmm1,%%xmm1 \n" \
"psllw $7,%%xmm0 \n" \
"pshufb %%xmm1,%%xmm0 \n" \
"psubb %%xmm0,%%xmm3 \n" \
"pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
"movdqa (%[yuvconstants]),%%xmm0 \n" \
"movdqa 32(%[yuvconstants]),%%xmm1 \n" \
"movdqa 64(%[yuvconstants]),%%xmm2 \n" \
"pmaddubsw %%xmm3,%%xmm0 \n" \
"pmaddubsw %%xmm3,%%xmm1 \n" \
"pmaddubsw %%xmm3,%%xmm2 \n" \
"movdqa 128(%[yuvconstants]),%%xmm3 \n" \
"paddw %%xmm3,%%xmm4 \n" \
"paddsw %%xmm4,%%xmm0 \n" \
"paddsw %%xmm4,%%xmm2 \n" \
"psubsw %%xmm1,%%xmm4 \n" \
"movdqa %%xmm4,%%xmm1 \n"
#define YUVTORGB_REGS "xmm7",
#define YUVTORGB_REGS
#endif
#define YUVTORGB(yuvconstants) \
YUVTORGB16(yuvconstants) \
"psrlw $0x6,%%xmm0 \n" \
"psrlw $0x6,%%xmm1 \n" \
"psrlw $0x6,%%xmm2 \n" \
"psraw $0x6,%%xmm0 \n" \
"psraw $0x6,%%xmm1 \n" \
"psraw $0x6,%%xmm2 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
@ -2416,9 +2403,12 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
"psraw $0x4,%%xmm0 \n" \
"psraw $0x4,%%xmm1 \n" \
"psraw $0x4,%%xmm2 \n" \
"pminuw %%xmm6,%%xmm0 \n" \
"pminuw %%xmm6,%%xmm1 \n" \
"pminuw %%xmm6,%%xmm2 \n" \
"pminsw %%xmm7,%%xmm0 \n" \
"pminsw %%xmm7,%%xmm1 \n" \
"pminsw %%xmm7,%%xmm2 \n" \
"pmaxsw %%xmm6,%%xmm0 \n" \
"pmaxsw %%xmm6,%%xmm1 \n" \
"pmaxsw %%xmm6,%%xmm2 \n" \
"psllw $0x4,%%xmm2 \n" \
"movdqa %%xmm0,%%xmm3 \n" \
"punpcklwd %%xmm2,%%xmm0 \n" \
@ -2588,8 +2578,9 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2605,7 +2596,7 @@ void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -2682,8 +2673,9 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2699,7 +2691,7 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -2716,8 +2708,9 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2733,7 +2726,7 @@ void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -2850,8 +2843,9 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -2867,7 +2861,7 @@ void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -3076,8 +3070,9 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -3092,7 +3087,7 @@ void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -3106,8 +3101,9 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $14,%%xmm5 \n"
"psllw $4,%%xmm5 \n" // 2 alpha bits
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $6,%%xmm6 \n" // 1023 for max
"pxor %%xmm6,%%xmm6 \n" // 0 for min
"pcmpeqb %%xmm7,%%xmm7 \n"
"psrlw $6,%%xmm7 \n" // 1023 for max
LABELALIGN
"1: \n"
@ -3122,7 +3118,7 @@ void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@ -3360,70 +3356,58 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
#if defined(__x86_64__)
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
"vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
"vmovdqa (%[yuvconstants]),%%ymm8 \n" \
"vpsllw $7,%%xmm13,%%xmm13 \n" \
"vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
"vpbroadcastb %%xmm13,%%ymm13 \n" \
"vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
"vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \
"vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \
"vmovdqa 192(%[yuvconstants]),%%ymm14 \n" \
"vbroadcastf128 256(%[yuvconstants]),%%ymm15 \n" \
"vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n"
"vmovdqa 128(%[yuvconstants]),%%ymm12 \n"
// TODO(yuan): Consider signed UV and unsigned coefficient for vpmaddubsw.
#define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw %%ymm8,%%ymm3,%%ymm0 \n" \
"vpmaddubsw %%ymm10,%%ymm3,%%ymm2 \n" \
"vpsllw $8,%%ymm3,%%ymm1 \n" \
"vpand %%ymm1,%%ymm15,%%ymm1 \n" \
"vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpmaddubsw %%ymm9,%%ymm3,%%ymm1 \n" \
"vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
"vpand %%ymm3,%%ymm7,%%ymm3 \n" \
"vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpaddw %%ymm4,%%ymm12,%%ymm3 \n" \
"vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
"vpsubusw %%ymm11,%%ymm0,%%ymm0 \n" \
"vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
"vpsubusw %%ymm13,%%ymm2,%%ymm2 \n"
"vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
"vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
"vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
"vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
"vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
"vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 \
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",
#else // Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB16_AVX2(yuvconstants) \
"vpmaddubsw (%[yuvconstants]),%%ymm3,%%ymm0 \n" \
"vpmaddubsw 64(%[yuvconstants]),%%ymm3,%%ymm2 \n" \
"vpsllw $8,%%ymm3,%%ymm1 \n" \
"vbroadcastf128 256(%[yuvconstants]),%%ymm7 \n" \
"vpand %%ymm7,%%ymm1,%%ymm1 \n" \
"vpaddw %%ymm1,%%ymm0,%%ymm0 \n" \
"vpmaddubsw 32(%[yuvconstants]),%%ymm3,%%ymm1 \n" \
"vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
"vbroadcastf128 272(%[yuvconstants]),%%ymm7 \n" \
"vpand %%ymm7,%%ymm3,%%ymm3 \n" \
"vpaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" \
"vmovdqu 128(%[yuvconstants]),%%ymm7 \n" \
"vpaddw %%ymm4,%%ymm7,%%ymm3 \n" \
"vpaddw %%ymm4,%%ymm2,%%ymm2 \n" \
"vmovdqu 96(%[yuvconstants]),%%ymm7 \n" \
"vpsubusw %%ymm7,%%ymm0,%%ymm0 \n" \
"vpsubusw %%ymm1,%%ymm3,%%ymm1 \n" \
"vmovdqu 160(%[yuvconstants]),%%ymm7 \n" \
"vpsubusw %%ymm7,%%ymm2,%%ymm2 \n"
"vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
"vpsllw $7,%%xmm0,%%xmm0 \n" \
"vpbroadcastb %%xmm0,%%ymm0 \n" \
"vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
"vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
"vmovdqa (%[yuvconstants]),%%ymm0 \n" \
"vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
"vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
"vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
"vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
"vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
"vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
"vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
"vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
#define YUVTORGB_REGS_AVX2 "xmm7",
#define YUVTORGB_REGS_AVX2
#endif
#define YUVTORGB_AVX2(yuvconstants) \
YUVTORGB16_AVX2(yuvconstants) \
"vpsrlw $0x6,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x6,%%ymm1,%%ymm1 \n" \
"vpsrlw $0x6,%%ymm2,%%ymm2 \n" \
"vpsraw $0x6,%%ymm0,%%ymm0 \n" \
"vpsraw $0x6,%%ymm1,%%ymm1 \n" \
"vpsraw $0x6,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
@ -3442,12 +3426,15 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
// Store 16 AR30 values.
#define STOREAR30_AVX2 \
"vpsrlw $0x4,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x4,%%ymm1,%%ymm1 \n" \
"vpsrlw $0x4,%%ymm2,%%ymm2 \n" \
"vpminuw %%ymm6,%%ymm0,%%ymm0 \n" \
"vpminuw %%ymm6,%%ymm1,%%ymm1 \n" \
"vpminuw %%ymm6,%%ymm2,%%ymm2 \n" \
"vpsraw $0x4,%%ymm0,%%ymm0 \n" \
"vpsraw $0x4,%%ymm1,%%ymm1 \n" \
"vpsraw $0x4,%%ymm2,%%ymm2 \n" \
"vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \
"vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \
"vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \
"vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \
"vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \
"vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \
"vpsllw $0x4,%%ymm2,%%ymm2 \n" \
"vpermq $0xd8,%%ymm0,%%ymm0 \n" \
"vpermq $0xd8,%%ymm1,%%ymm1 \n" \
@ -3548,8 +3535,9 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3567,7 +3555,7 @@ void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I422TOAR30ROW_AVX2
@ -3657,8 +3645,9 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3676,7 +3665,7 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I210TOAR30ROW_AVX2
@ -3696,8 +3685,9 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3715,7 +3705,7 @@ void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I212TOAR30ROW_AVX2
@ -3842,8 +3832,9 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -3861,7 +3852,7 @@ void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", YUVTORGB_REGS_AVX2
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_I410TOAR30ROW_AVX2
@ -4204,8 +4195,9 @@ void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -4240,8 +4232,9 @@ void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 1023 for max
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min
"vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max
"vpsrlw $6,%%ymm7,%%ymm7 \n"
LABELALIGN
"1: \n"
@ -4269,8 +4262,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
"movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164
"movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
"movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164
"movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16
"pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000
"pslld $0x18,%%xmm4 \n"
@ -4314,8 +4307,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(
"vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164
"vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
"vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164
"vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000
"vpslld $0x18,%%ymm4,%%ymm4 \n"


@ -75,28 +75,18 @@ extern "C" {
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(yuvconstants) \
xmm0 = _mm_loadu_si128(&xmm3); \
xmm1 = _mm_loadu_si128(&xmm3); \
xmm2 = _mm_loadu_si128(&xmm3); \
xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
xmm1 = _mm_slli_epi16(xmm1, 8); \
xmm1 = _mm_and_si128(xmm1, *(__m128i*)yuvconstants->kUVMaskBR); \
xmm0 = _mm_add_epi16(xmm0, xmm1); \
xmm1 = _mm_loadu_si128(&xmm3); \
xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80)); \
xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
xmm3 = _mm_and_si128(xmm3, *((__m128i*)(yuvconstants->kUVMaskBR) + 1)); \
xmm2 = _mm_add_epi16(xmm2, xmm3); \
xmm0 = _mm_add_epi16(xmm0, xmm4); \
xmm2 = _mm_add_epi16(xmm2, xmm4); \
xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kUVBiasG); \
xmm0 = _mm_subs_epu16(xmm0, *(__m128i*)yuvconstants->kUVBiasB); \
xmm1 = _mm_subs_epu16(xmm4, xmm1); \
xmm2 = _mm_subs_epu16(xmm2, *(__m128i*)yuvconstants->kUVBiasR); \
xmm0 = _mm_srli_epi16(xmm0, 6); \
xmm1 = _mm_srli_epi16(xmm1, 6); \
xmm2 = _mm_srli_epi16(xmm2, 6); \
xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3); \
xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3); \
xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3); \
xmm0 = _mm_adds_epi16(xmm4, xmm0); \
xmm1 = _mm_subs_epi16(xmm4, xmm1); \
xmm2 = _mm_adds_epi16(xmm4, xmm2); \
xmm0 = _mm_srai_epi16(xmm0, 6); \
xmm1 = _mm_srai_epi16(xmm1, 6); \
xmm2 = _mm_srai_epi16(xmm2, 6); \
xmm0 = _mm_packus_epi16(xmm0, xmm0); \
xmm1 = _mm_packus_epi16(xmm1, xmm1); \
xmm2 = _mm_packus_epi16(xmm2, xmm2);
@ -254,11 +244,11 @@ static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u};
// 8 bit fixed point 0.5, for bias of UV.
static const ulvec8 kBiasUV128 = {
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
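
kBiasUV128 now does double duty: the RGB-to-UV rows below use it in place of kAddUV128/kAddUVJ128 as the +128 output bias, and the new YUVTORGB code subtracts it (psubb / vpsubb / _mm_sub_epi8) to reinterpret unsigned U/V bytes as signed values centered at zero. A hypothetical one-byte view:

  uint8_t u = 64;                  // unsigned chroma sample
  int8_t ui = (int8_t)(u - 0x80);  // -64, the signed value fed to pmaddubsw
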
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
@ -1447,7 +1437,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@ -1519,7 +1509,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUVJ128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToVJ
movdqa xmm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@ -1593,7 +1583,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUV128
vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToV
vbroadcastf128 ymm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@ -1661,7 +1651,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
vbroadcastf128 ymm5, xmmword ptr kBiasUV128
vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
sub edi, edx // stride from u to v
@ -1726,7 +1716,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kARGBToV
movdqa xmm7, xmmword ptr kARGBToU
sub edi, edx // stride from u to v
@ -1787,7 +1777,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kBGRAToV
movdqa xmm7, xmmword ptr kBGRAToU
sub edi, edx // stride from u to v
@ -1859,7 +1849,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kABGRToV
movdqa xmm7, xmmword ptr kABGRToU
sub edi, edx // stride from u to v
@ -1931,7 +1921,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // width
movdqa xmm5, xmmword ptr kAddUV128
movdqa xmm5, xmmword ptr kBiasUV128
movdqa xmm6, xmmword ptr kRGBAToV
movdqa xmm7, xmmword ptr kRGBAToU
sub edi, edx // stride from u to v
@ -2098,32 +2088,25 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) \
__asm { \
__asm vpmaddubsw ymm0, ymm3, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
__asm vpmaddubsw ymm2, ymm3, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
__asm vpsllw ymm1, ymm3, 8 \
__asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KUMASKB] \
__asm vpand ymm1, ymm1, ymm6 \
__asm vpaddw ymm0, ymm0, ymm1 \
__asm vpmaddubsw ymm1, ymm3, ymmword ptr [YuvConstants + KUVTOG] /* B UV */\
__asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
__asm vbroadcastf128 ymm6, xmmword ptr [YuvConstants + KVMASKR] \
__asm vpand ymm3, ymm3, ymm6 \
__asm vpaddw ymm2, ymm2, ymm3 \
__asm vpaddw ymm0, ymm0, ymm4 \
__asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASG] \
__asm vpaddw ymm3, ymm4, ymm6 \
__asm vpaddw ymm2, ymm2, ymm4 \
__asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASB] \
__asm vpsubusw ymm0, ymm0, ymm6 \
__asm vpsubusw ymm1, ymm3, ymm1 \
__asm vmovdqu ymm6, ymmword ptr [YuvConstants + KUVBIASR] \
__asm vpsubusw ymm2, ymm2, ymm6 \
__asm vpsrlw ymm0, ymm0, 6 \
__asm vpsrlw ymm1, ymm1, 6 \
__asm vpsrlw ymm2, ymm2, 6 \
__asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
__asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
__asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
__asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \
__asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \
__asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \
__asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \
__asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \
__asm vpmaddubsw ymm2, ymm2, ymm3 /* R UV */ \
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \
__asm vpaddw ymm4, ymm3, ymm4 \
__asm vpaddsw ymm0, ymm0, ymm4 \
__asm vpsubsw ymm1, ymm4, ymm1 \
__asm vpaddsw ymm2, ymm2, ymm4 \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
__asm vpackuswb ymm0, ymm0, ymm0 \
__asm vpackuswb ymm1, ymm1, ymm1 \
__asm vpackuswb ymm2, ymm2, ymm2 \
}
// Store 16 ARGB values.
@ -2583,30 +2566,23 @@ __declspec(naked) void I422ToRGBARow_AVX2(
// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB(YuvConstants) \
__asm { \
__asm movdqa xmm0, xmm3 \
__asm movdqa xmm1, xmm3 \
__asm movdqa xmm2, xmm3 \
__asm pmaddubsw xmm0, xmmword ptr [YuvConstants + KUVTOB] \
__asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOR] \
__asm psllw xmm1, 8 \
__asm pand xmm1, xmmword ptr [YuvConstants + KUMASKB] \
__asm paddw xmm0, xmm1 \
__asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOG] \
__asm psubb xmm3, xmmword ptr kBiasUV128 \
__asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
__asm pand xmm3, xmmword ptr [YuvConstants + KVMASKR] \
__asm paddw xmm0, xmm4 \
__asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASG] \
__asm paddw xmm2, xmm4 \
__asm paddw xmm4, xmm6 \
__asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASB] \
__asm psubusw xmm0, xmm6 \
__asm psubusw xmm4, xmm1 \
__asm movdqa xmm6, xmmword ptr [YuvConstants + KUVBIASR] \
__asm psubusw xmm2, xmm6 \
__asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \
__asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \
__asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \
__asm pmaddubsw xmm0, xmm3 \
__asm pmaddubsw xmm1, xmm3 \
__asm pmaddubsw xmm2, xmm3 \
__asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \
__asm paddw xmm4, xmm3 \
__asm paddsw xmm0, xmm4 \
__asm paddsw xmm2, xmm4 \
__asm psubsw xmm4, xmm1 \
__asm movdqa xmm1, xmm4 \
__asm psrlw xmm0, 6 \
__asm psrlw xmm1, 6 \
__asm psrlw xmm2, 6 \
__asm psraw xmm0, 6 \
__asm psraw xmm1, 6 \
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \


@ -11,7 +11,7 @@
#ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT
#define UNIT_TEST_UNIT_TEST_H_
#ifdef WIN32
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/resource.h>