Neon RGB To I420

BUG=none
TEST=convert_test
Review URL: https://webrtc-codereview.appspot.com/936015

git-svn-id: http://libyuv.googlecode.com/svn/trunk@479 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-01-01 03:12:16 +08:00 · 2012-11-07 08:27:24 +00:00 · 2012-11-07 08:27:24 +00:00 · f1daa3db65
commit f1daa3db65
parent dd2d512e5a
9 changed files with 168 additions and 54 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 478
+Version: 479
 License: BSD
 License File: LICENSE

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -197,6 +197,7 @@ extern "C" {
 #define HAS_ARGBTOUV422ROW_NEON
 #define HAS_ARGBTOUV411ROW_NEON
 #define HAS_ARGBTOUVROW_NEON
+#define HAS_RGB565TOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_ABGRTOYROW_NEON
 #define HAS_RGBATOYROW_NEON
@ -354,6 +355,8 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix);
 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix);
 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
@ -402,7 +405,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width);
+                                 uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
@ -411,6 +414,10 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
                           uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                            uint8* dst_u, uint8* dst_v, int pix);
 void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
                   uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
@ -419,6 +426,8 @@ void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
                   uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
                   uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width);

 void ARGBToUV422Row_SSSE3(const uint8* src_argb,
                          uint8* dst_u, uint8* dst_v, int width);
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 478
+#define LIBYUV_VERSION 479

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/convert.cc
+++ b/source/convert.cc
@ -1379,6 +1379,25 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
    src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
    src_stride_rgb565 = -src_stride_rgb565;
  }
+
+#if defined(HAS_RGB565TOYROW_NEON)
+  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+      RGB565ToYRow_C;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB565ToYRow = RGB565ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB565ToYRow = RGB565ToYRow_NEON;
+    }
+    if (width >= 16) {
+      RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB565ToUVRow = RGB565ToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_RGB565TOYROW_NEON
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
      RGB565ToARGBRow_C;
@ -1389,15 +1408,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
      RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
    }
  }
-#elif defined(HAS_RGB565TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB565ToARGBRow = RGB565ToARGBRow_NEON;
-    }
-  }
 #endif
-
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
@ -1408,20 +1419,6 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
    }
  }
 #endif
-
-#if defined(HAS_RGB565TOYROW_NEON)
-  void (*RGB565ToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      RGB565ToYRow_C;
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    RGB565ToYRow = RGB565ToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB565ToYRow = RGB565ToYRow_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
-    }
-  }
-#else
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
      ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
@ -1438,13 +1435,14 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
 #endif  // HAS_RGB565TOYROW_NEON

  for (int y = 0; y < height - 1; y += 2) {
-    RGB565ToARGBRow(src_rgb565, row, width);
-    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
 #if defined(HAS_RGB565TOYROW_NEON)
+    RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
    RGB565ToYRow(src_rgb565, dst_y, width);
    RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
 #else
+    RGB565ToARGBRow(src_rgb565, row, width);
+    RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
+    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
    ARGBToYRow(row, dst_y, width);
    ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
 #endif
@ -1454,11 +1452,12 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
    dst_v += dst_stride_v;
  }
  if (height & 1) {
-    RGB565ToARGBRow_C(src_rgb565, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
 #if defined(HAS_RGB565TOYROW_NEON)
+    RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
    RGB565ToYRow(src_rgb565, dst_y, width);
 #else
+    RGB565ToARGBRow(src_rgb565, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
    ARGBToYRow(row, dst_y, width);
 #endif
  }
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -782,9 +782,9 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
  void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
      YUY2ToARGBRow_C;
 #if defined(HAS_YUY2TOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {  // posix is 16, win is 8.
    YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
      YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@ -826,9 +826,9 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
  void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
      UYVYToARGBRow_C;
 #if defined(HAS_UYVYTOARGBROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {  // posix is 16, win is 8.
    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
      UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -143,9 +143,9 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
 RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
-       7, 2, 4)
+       15, 2, 4)
 RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
-       7, 2, 4)
+       15, 2, 4)
 #endif
 #if defined(HAS_ARGBTORGB24ROW_NEON)
 RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
@ -224,12 +224,12 @@ UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
 UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
 UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
 UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
-#endif
-#ifdef HAS_YUY2TOUVROW_SSE2
 UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
 UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
 #endif
-#ifdef HAS_YUY2TOUVROW_NEON
+#ifdef HAS_ARGBTOUVROW_NEON
+UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4)
+UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2)
 UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
 UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
 #endif
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -349,6 +349,52 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
  }
 }

+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+  for (int x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b1 = src_rgb565[2] & 0x1f;
+    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+    uint8 r1 = src_rgb565[3] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 b3 = next_rgb565[2] & 0x1f;
+    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+    uint8 r3 = next_rgb565[3] >> 3;
+    uint8 ab = (b0 + b1 + b2 + b3);
+    uint8 ag = (g0 + g1 + g2 + g3);
+    uint8 ar = (r0 + r1 + r2 + r3);
+    ab = (ab << 1) | (ab >> 6);
+    ar = (ar << 1) | (ar >> 6);
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_rgb565 += 4;
+    next_rgb565 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 ab = (b0 + b2);
+    uint8 ag = (g0 + g2);
+    uint8 ar = (r0 + r2);
+    ab = (ab << 2) | (ab >> 4);
+    ag = (ag << 1) | (ag >> 6);
+    ar = (ar << 2) | (ar >> 4);
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
+
 void ARGBToUV444Row_C(const uint8* src_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width; ++x) {
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -1114,22 +1114,20 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {

 #ifdef HAS_RGB565TOARGBROW_NEON
 #define RGB565TOARGB                                                           \
-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxGGGGGG           */ \
-    "vshrn.u16  d6, q0, #8                     \n"  /* R RRRRRxxx           */ \
-    "vshl.u8    d0, d4, #3                     \n"  /* B BBBBB000 upper 5   */ \
-    "vshl.u8    d1, d5, #2                     \n"  /* G GGGGGG00 upper 6   */ \
-    "vbic.u8    d2, d6, d7                     \n"  /* R RRRRR000 upper 5   */ \
-    "vshr.u8    d4, d0, #5                     \n"  /* B 00000BBB lower 3   */ \
-    "vshr.u8    d5, d1, #6                     \n"  /* G 000000GG lower 2   */ \
-    "vshr.u8    d6, d2, #5                     \n"  /* R 00000RRR lower 3   */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
-    "vorr.u8    d2, d2, d6                     \n"  /* R                    */
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
+    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
+    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */

 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8    d3, #255                       \n"  // Alpha
-    "vmov.u8    d7, #7                         \n"  // 5 bit mask
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
@ -1207,7 +1205,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
    "+r"(dst_argb),    // %1
    "+r"(pix)          // %2
  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  : "memory", "cc", "q0", "q1", "q2"  // Clobber List
  );
 }
 #endif  // HAS_ARGB4444TOARGBROW_NEON
@ -1765,10 +1763,72 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
 }
 #endif  // HAS_ARGBTOUVROW_NEON

+
+// 16x2 pixels -> 8x1.  pix is number of rgb565 pixels. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_rgb565
+    "vmov.s16   q10, #112 / 4                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 4                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 4                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 4                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 4                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_stride_rgb565),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+#endif  // HAS_RGB565TOUVROW_NEON
+
 #ifdef HAS_RGB565TOYROW_NEON
 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
  asm volatile (
-    "vmov.u8    d7, #7                         \n"  // 5 bit mask
    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@ -611,7 +611,7 @@ TESTATOPLANAR(ABGR, 4, I420, 2, 2, 4)
 TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4)
 TESTATOPLANAR(RAW, 3, I420, 2, 2, 4)
 TESTATOPLANAR(RGB24, 3, I420, 2, 2, 4)
-TESTATOPLANAR(RGB565, 2, I420, 2, 2, 4)
+TESTATOPLANAR(RGB565, 2, I420, 2, 2, 5)
 TESTATOPLANAR(ARGB1555, 2, I420, 2, 2, 4)
 TESTATOPLANAR(ARGB4444, 2, I420, 2, 2, 4)
 TESTATOPLANAR(ARGB, 4, I411, 4, 1, 4)