From 09db0c4ce2008008f73b247f1a5b64cfbb29b72e Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@chromium.org>
Date: Fri, 19 Jan 2018 10:22:08 -0800
Subject: [PATCH] H010ToAR30 in 1 step with SSSE3 assembly

Switch YUV conversion macro to output 16 bits per channel.
STOREAR30 macro to output AR30.

[ RUN      ] LibYUVConvertTest.TestH420ToARGB
uniques: B 220, G, 220, R 220
[       OK ] LibYUVConvertTest.TestH420ToARGB (0 ms)
[ RUN      ] LibYUVConvertTest.TestH010ToARGB
uniques: B 256, G, 256, R 256
[       OK ] LibYUVConvertTest.TestH010ToARGB (0 ms)
[ RUN      ] LibYUVConvertTest.TestH010ToAR30
uniques: B 883, G, 883, R 883
[       OK ] LibYUVConvertTest.TestH010ToAR30 (0 ms)

Bug: libyuv:751
Test: LibYUVConvertTest.H010ToAR30_Opt
Change-Id: I902b718e2c8b68ede69625ccafebc6519d5af70d
Reviewed-on: https://chromium-review.googlesource.com/869511
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
---
 README.chromium               |   2 +-
 include/libyuv/convert_argb.h |  13 ++++
 include/libyuv/row.h          |  19 +++++
 include/libyuv/version.h      |   2 +-
 source/convert_argb.cc        |  79 +++++++++------------
 source/row_any.cc             |   3 +
 source/row_common.cc          | 112 ++++++++++++++++++++++++++++--
 source/row_gcc.cc             | 126 +++++++++++++++++++++++++---------
 8 files changed, 269 insertions(+), 87 deletions(-)

diff --git a/README.chromium b/README.chromium
index 77ab37e82..8d6e615a0 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1689
+Version: 1690
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h
index b8b57cb12..973c615b9 100644
--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -420,6 +420,19 @@ int H010ToARGB(const uint16* src_y,
                int width,
                int height);
 
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16* src_y,
+               int src_stride_y,
+               const uint16* src_u,
+               int src_stride_u,
+               const uint16* src_v,
+               int src_stride_v,
+               uint8* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
 // Convert H010 to AR30.
 LIBYUV_API
 int H010ToAR30(const uint16* src_y,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 28ecc6726..08b3465e6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -256,6 +256,7 @@ extern "C" {
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
 // I210 is for H010.  2 = 422.  I for 601 vs H for 709.
+#define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
@@ -1682,6 +1683,12 @@ void I422ToARGBRow_C(const uint8* src_y,
                      uint8* dst_argb,
                      const struct YuvConstants* yuvconstants,
                      int width);
+void I210ToAR30Row_C(const uint16* src_y,
+                     const uint16* src_u,
+                     const uint16* src_v,
+                     uint8* dst_ar30,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
 void I210ToARGBRow_C(const uint16* src_y,
                      const uint16* src_u,
                      const uint16* src_v,
@@ -1791,6 +1798,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
                          const struct YuvConstants* yuvconstants,
                          int width);
 
+void I210ToAR30Row_SSSE3(const uint16* src_y,
+                         const uint16* src_u,
+                         const uint16* src_v,
+                         uint8* dst_ar30,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
 void I210ToARGBRow_SSSE3(const uint16* src_y,
                          const uint16* src_u,
                          const uint16* src_v,
@@ -1947,6 +1960,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width);
+void I210ToAR30Row_Any_SSSE3(const uint16* src_y,
+                             const uint16* src_u,
+                             const uint16* src_v,
+                             uint8* dst_ar30,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
 void I210ToARGBRow_Any_SSSE3(const uint16* src_y,
                              const uint16* src_u,
                              const uint16* src_v,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index b191add97..dc0bf6f0b 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1689
+#define LIBYUV_VERSION 1690
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 292010831..20cac2de2 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -413,7 +413,7 @@ int H422ToABGR(const uint8* src_y,
 // Convert 10 bit YUV to ARGB with matrix
 // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
 // multiply 10 bit yuv into high bits to allow any number of bits.
-static int H010ToAR30Matrix(const uint16* src_y,
+static int I010ToAR30Matrix(const uint16* src_y,
                             int src_stride_y,
                             const uint16* src_u,
                             int src_stride_u,
@@ -425,12 +425,10 @@ static int H010ToAR30Matrix(const uint16* src_y,
                             int width,
                             int height) {
   int y;
-  void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf,
+  void (*I210ToAR30Row)(const uint16* y_buf, const uint16* u_buf,
                         const uint16* v_buf, uint8* rgb_buf,
                         const struct YuvConstants* yuvconstants, int width) =
-      I210ToARGBRow_C;
-  void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToAR30Row_C;
+      I210ToAR30Row_C;
   if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
@@ -440,60 +438,51 @@ static int H010ToAR30Matrix(const uint16* src_y,
     dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
     dst_stride_ar30 = -dst_stride_ar30;
   }
-#if defined(HAS_I210TOARGBROW_SSSE3)
+#if defined(HAS_I210TOAR30ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
+    I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I210ToARGBRow = I210ToARGBRow_SSSE3;
+      I210ToAR30Row = I210ToAR30Row_SSSE3;
     }
   }
 #endif
-#if defined(HAS_I210TOARGBROW_AVX2)
+#if defined(HAS_I210TOAR30ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+    I210ToAR30Row = I210ToAR30Row_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I210ToARGBRow = I210ToARGBRow_AVX2;
+      I210ToAR30Row = I210ToAR30Row_AVX2;
     }
   }
 #endif
-#if defined(HAS_ARGBTOAR30ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
+  for (y = 0; y < height; ++y) {
+    I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
     }
   }
-#endif
-#if defined(HAS_ARGBTOAR30ROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToAR30Row = ARGBToAR30Row_AVX2;
-    }
-  }
-#endif
-
-  {
-    // Row buffers for 8 bit YUV and RGB.
-    align_buffer_64(row_argb, width * 4);
-
-    for (y = 0; y < height; ++y) {
-      I210ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width);
-      ARGBToAR30Row(row_argb, dst_ar30, width);
-      dst_ar30 += dst_stride_ar30;
-      src_y += src_stride_y;
-      if (y & 1) {
-        src_u += src_stride_u;
-        src_v += src_stride_v;
-      }
-    }
-
-    free_aligned_buffer_64(row_argb);
-  }
-
   return 0;
 }
 
+// Convert I010 to AR30 using the kYuvI601Constants matrix.
+LIBYUV_API
+int I010ToAR30(const uint16* src_y,
+               int src_stride_y,
+               const uint16* src_u,
+               int src_stride_u,
+               const uint16* src_v,
+               int src_stride_v,
+               uint8* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
 // Convert H010 to AR30.
 LIBYUV_API
 int H010ToAR30(const uint16* src_y,
@@ -506,7 +495,7 @@ int H010ToAR30(const uint16* src_y,
                int dst_stride_ar30,
                int width,
                int height) {
-  return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                           src_stride_v, dst_ar30, dst_stride_ar30,
                           &kYuvH709Constants, width, height);
 }
diff --git a/source/row_any.cc b/source/row_any.cc
index 9f4725bf5..d52a4a0ad 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -214,6 +214,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
     memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP);       \
   }
 
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16, 2, 4, 7)
+#endif
 #ifdef HAS_I210TOARGBROW_SSSE3
 ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7)
 #endif
diff --git a/source/row_common.cc b/source/row_common.cc
index a0ca90b8a..395f45905 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -11,6 +11,7 @@
 #include "libyuv/row.h"
 
 #include <string.h>  // For memcpy and memset.
+#include <stdio.h>
 
 #include "libyuv/basic_types.h"
 
@@ -31,9 +32,8 @@ static __inline int32 clamp255(int32 v) {
   return (((255 - (v)) >> 31) | (v)) & 255;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32 clamp1023(int32 v) {
+  return (((1023 - (v)) >> 31) | (v)) & 1023;
 }
 
 static __inline uint32 Abs(int32 v) {
@@ -49,15 +49,23 @@ static __inline int32 clamp255(int32 v) {
   return (v > 255) ? 255 : v;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32 clamp1023(int32 v) {
+  return (v > 1023) ? 1023 : v;
 }
 
 static __inline uint32 Abs(int32 v) {
   return (v < 0) ? -v : v;
 }
 #endif  // USE_BRANCHLESS
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Clamp10(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp1023(v));
+}
 
 #ifdef LIBYUV_LITTLE_ENDIAN
 #define WRITEWORD(p, v) *(uint32*)(p) = v
@@ -1340,6 +1348,56 @@ static __inline void YuvPixel10(uint16 y,
   *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
 }
 
+// C reference code that mimics the YUV 16 bit assembly.
+// Converts one pixel; u and v are shifted right by 2 and passed through
+// clamp255 before the chroma multiplies.  b, g and r are written unshifted
+// and unclamped so the caller picks the output precision, e.g. StoreAR30
+// shifts right by 4 and clamps to 10 bits.
+static __inline void YuvPixel16(int16 y,
+                                int16 u,
+                                int16 v,
+                                int* b,
+                                int* g,
+                                int* r,
+                                const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  // Luma is scaled by yg; chroma terms are subtracted and the bias added.
+  uint32 y1 = (uint32)((y << 6) * yg) >> 16;
+  u = clamp255(u >> 2);
+  v = clamp255(v >> 2);
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
 // Y contribution to R,G,B.  Scale and bias.
 #define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
@@ -1460,6 +1518,48 @@ void I210ToARGBRow_C(const uint16* src_y,
   }
 }
 
+static void StoreAR30(uint8* rgb_buf,
+                      int b,
+                      int g,
+                      int r) {
+  uint32 ar30;
+  b = b >> 4;  // drop 4 fraction bits; clamped to 10 bits below.
+  g = g >> 4;
+  r = r >> 4;
+  b = Clamp10(b);
+  g = Clamp10(g);
+  r = Clamp10(r);
+  ar30 = b | ((uint32)g << 10) | ((uint32)r << 20) | 0xc0000000;
+  (*(uint32*)rgb_buf) = ar30;  // one 32 bit store per pixel.
+}
+
+// 10 bit YUV to 10 bit AR30
+void I210ToAR30Row_C(const uint16* src_y,
+                     const uint16* src_u,
+                     const uint16* src_v,
+                     uint8* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {  // 2 pixels share one U and V.
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {  // Odd width: convert the last pixel on its own.
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+  }
+}
+
 void I422AlphaToARGBRow_C(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index df6a8c1e3..8ea735081 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1696,7 +1696,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
     "movdqa     160(%[yuvconstants]),%%xmm13                    \n"            \
     "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                    \
+#define YUVTORGB16(yuvconstants)                                  \
   "movdqa     %%xmm0,%%xmm1                                   \n" \
   "movdqa     %%xmm0,%%xmm2                                   \n" \
   "movdqa     %%xmm0,%%xmm3                                   \n" \
@@ -1712,45 +1712,42 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
   "pmulhuw    %%xmm14,%%xmm4                                  \n" \
   "paddsw     %%xmm4,%%xmm0                                   \n" \
   "paddsw     %%xmm4,%%xmm1                                   \n" \
-  "paddsw     %%xmm4,%%xmm2                                   \n" \
-  "psraw      $0x6,%%xmm0                                     \n" \
-  "psraw      $0x6,%%xmm1                                     \n" \
-  "psraw      $0x6,%%xmm2                                     \n" \
-  "packuswb   %%xmm0,%%xmm0                                   \n" \
-  "packuswb   %%xmm1,%%xmm1                                   \n" \
-  "packuswb   %%xmm2,%%xmm2                                   \n"
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS \
   "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
 
 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants) \
-  "movdqa     %%xmm0,%%xmm1                                     \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     96(%[yuvconstants]),%%xmm0                      \n"            \
-    "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     128(%[yuvconstants]),%%xmm1                     \n"            \
-    "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     160(%[yuvconstants]),%%xmm2                     \n"            \
-    "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB16(yuvconstants) \
+  "movdqa     %%xmm0,%%xmm1                                   \n"            \
+  "movdqa     %%xmm0,%%xmm2                                   \n"            \
+  "movdqa     %%xmm0,%%xmm3                                   \n"            \
+  "movdqa     96(%[yuvconstants]),%%xmm0                      \n"            \
+  "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n"            \
+  "psubw      %%xmm1,%%xmm0                                   \n"            \
+  "movdqa     128(%[yuvconstants]),%%xmm1                     \n"            \
+  "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n"            \
+  "psubw      %%xmm2,%%xmm1                                   \n"            \
+  "movdqa     160(%[yuvconstants]),%%xmm2                     \n"            \
+  "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n"            \
+  "psubw      %%xmm3,%%xmm2                                   \n"            \
+  "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n"            \
+  "paddsw     %%xmm4,%%xmm0                                   \n"            \
+  "paddsw     %%xmm4,%%xmm1                                   \n"            \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS
 #endif
 
+#define YUVTORGB(yuvconstants) \
+    YUVTORGB16(yuvconstants)                                      \
+  "psraw      $0x6,%%xmm0                                     \n" \
+  "psraw      $0x6,%%xmm1                                     \n" \
+  "psraw      $0x6,%%xmm2                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm1                                   \n" \
+  "packuswb   %%xmm2,%%xmm2                                   \n"
+
 // Store 8 ARGB values.
 #define STOREARGB \
   "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
@@ -1774,6 +1771,32 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
     "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n"           \
     "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"
 
+// Store 8 AR30 values.
+#define STOREAR30 \
+  "psraw      $0x4,%%xmm0                                      \n" \
+  "psraw      $0x4,%%xmm1                                      \n" \
+  "psraw      $0x4,%%xmm2                                      \n" \
+  "pminsw     %%xmm7,%%xmm0                                    \n" \
+  "pminsw     %%xmm7,%%xmm1                                    \n" \
+  "pminsw     %%xmm7,%%xmm2                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
+  "psllw      $0x4,%%xmm2                                      \n" \
+  "movdqa     %%xmm0,%%xmm3                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
+  "movdqa     %%xmm1,%%xmm2                                    \n" \
+  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
+  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
+  "pslld      $0xa,%%xmm1                                      \n" \
+  "pslld      $0xa,%%xmm2                                      \n" \
+  "por        %%xmm1,%%xmm0                                    \n" \
+  "por        %%xmm2,%%xmm3                                    \n" \
+  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
+  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
+  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
+
 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
@@ -1827,9 +1850,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
     "pshufb    %%xmm5,%%xmm0                   \n"
     "pshufb    %%xmm6,%%xmm1                   \n"
     "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "movq      %%xmm0,(%[dst_rgb24])            \n"
-    "movdqu    %%xmm1,0x8(%[dst_rgb24])              \n"
-    "lea       0x18(%[dst_rgb24]),%[dst_rgb24]           \n"
+    "movq      %%xmm0,(%[dst_rgb24])           \n"
+    "movdqu    %%xmm1,0x8(%[dst_rgb24])        \n"
+    "lea       0x18(%[dst_rgb24]),%[dst_rgb24] \n"
     "subl      $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -1908,6 +1931,41 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
   );
 }
 
+// 10 bit YUV to AR30, 8 pixels per loop iteration.
+void OMITFP I210ToAR30Row_SSSE3(const uint16* y_buf,
+                                const uint16* u_buf,
+                                const uint16* v_buf,
+                                uint8* dst_ar30,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // all ones
+    "psrlw     $14,%%xmm5                      \n"
+    "psllw     $4,%%xmm5                       \n"  // 2 alpha bits
+    "pxor      %%xmm6,%%xmm6                   \n"  // 0 for min
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // all ones
+    "psrlw     $6,%%xmm7                       \n"  // 1023 for max
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                      const uint8* u_buf,