From 03cd8584e7f7e5d597468ebed20296f9ba3bbe6e Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Tue, 22 Sep 2015 12:05:16 -0700
Subject: [PATCH] Read Y channel in read function for yuv conversion.

Allows reader to support YUY2 format.
Also contains fix for win64 build for yuv conversion.

TBR=harryjin@google.com
BUG=libyuv:488

Review URL: https://codereview.chromium.org/1355333002 .
---
 README.chromium          |  2 +-
 include/libyuv/version.h |  2 +-
 source/row_gcc.cc        | 28 ++++++++++-------
 source/row_win.cc        | 66 ++++++++++++++++++++++++----------------
 4 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/README.chromium b/README.chromium
index 3d9883403..72ab64198 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1487
+Version: 1488
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 79bb5439d..5f195d765 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1487
+#define LIBYUV_VERSION 1488
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index af5ca2b52..68846dc8f 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1324,7 +1324,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
     "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
     MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Read 4 UV from 422, upsample to 8 UV
 #define READYUV422                                                             \
@@ -1332,7 +1334,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
     MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
     "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Read 2 UV from 411, upsample to 8 UV
 #define READYUV411                                                             \
@@ -1341,13 +1345,17 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
     "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
     "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
     "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "punpckldq  %%xmm0,%%xmm0                                   \n"
+    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Read 4 UV from NV12, upsample to 8 UV
 #define READNV12                                                               \
     "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
     "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
 
 // Convert 8 pixels: 8 UV and 8 Y
 #define YUVTORGB(yuvconstants)                                                 \
@@ -1363,13 +1371,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
     "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
     "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
     "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
-    "punpcklbw  %%xmm3,%%xmm3                                   \n"            \
-    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm3      \n"            \
-    "paddsw     %%xmm3,%%xmm0                                   \n"            \
-    "paddsw     %%xmm3,%%xmm1                                   \n"            \
-    "paddsw     %%xmm3,%%xmm2                                   \n"            \
+    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
+    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
+    "paddsw     %%xmm4,%%xmm0                                   \n"            \
+    "paddsw     %%xmm4,%%xmm1                                   \n"            \
+    "paddsw     %%xmm4,%%xmm2                                   \n"            \
     "psraw      $0x6,%%xmm0                                     \n"            \
     "psraw      $0x6,%%xmm1                                     \n"            \
     "psraw      $0x6,%%xmm2                                     \n"            \
diff --git a/source/row_win.cc b/source/row_win.cc
index f09d2a75b..e8d3a9a19 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -34,7 +34,9 @@ extern "C" {
     xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
     xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
     xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;
+    u_buf += 4;                                                                \
+    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
+    y_buf += 8;                                                                \
 
 // Convert 8 pixels: 8 UV and 8 Y.
 #define YUVTORGB(YuvConstants)                                                 \
@@ -46,13 +48,11 @@ extern "C" {
     xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0);             \
     xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1);             \
     xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2);             \
-    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    y_buf += 8;                                                                \
-    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);                                      \
-    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)YuvConstants->kYToRgb);            \
-    xmm0 = _mm_adds_epi16(xmm0, xmm3);                                         \
-    xmm1 = _mm_adds_epi16(xmm1, xmm3);                                         \
-    xmm2 = _mm_adds_epi16(xmm2, xmm3);                                         \
+    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
+    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)YuvConstants->kYToRgb);            \
+    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
+    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
+    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
     xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
     xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
     xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
@@ -90,12 +90,12 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
                          uint8* dst_argb,
                          struct YuvConstants* yuvconstants,
                          int width) {
-  __m128i xmm0, xmm1, xmm2, xmm3;
+  __m128i xmm0, xmm1, xmm2, xmm4;
   const __m128i xmm5 = _mm_set1_epi8(-1);
   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
   while (width > 0) {
     READYUV422
-    YUVTORGB(YuvConstants)
+    YUVTORGB(yuvconstants)
     STOREARGB
     width -= 8;
   }
@@ -109,12 +109,12 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
                          uint8* dst_argb,
                          struct YuvConstants* yuvconstants,
                          int width) {
-  __m128i xmm0, xmm1, xmm2, xmm3;
+  __m128i xmm0, xmm1, xmm2, xmm4;
   const __m128i xmm5 = _mm_set1_epi8(-1);
   const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
   while (width > 0) {
     READYUV422
-    YUVTORGB(YuvConstants)
+    YUVTORGB(yuvconstants)
     STOREABGR
     width -= 8;
   }
@@ -1852,6 +1852,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm lea        eax, [eax + 16]                                           \
   }
 
 // Read 8 UV from 422, upsample to 16 UV.
@@ -1862,6 +1864,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm lea        eax, [eax + 16]                                           \
   }
 
 // Read 4 UV from 411, upsample to 16 UV.
@@ -1873,6 +1877,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm lea        eax, [eax + 16]                                           \
   }
 
 // Read 8 UV from NV12, upsample to 16 UV.
@@ -1881,6 +1887,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm lea        eax, [eax + 16]                                           \
   }
 
 // Convert 16 pixels: 16 UV and 16 Y.
@@ -1895,14 +1903,12 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
     __asm vpsubw     ymm0, ymm3, ymm0                                          \
     /* Step 2: Find Y contribution to 16 R,G,B values */                       \
-    __asm vmovdqu    xmm3, [eax]                                               \
-    __asm lea        eax, [eax + 16]                                           \
-    __asm vpermq     ymm3, ymm3, 0xd8                                          \
-    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
-    __asm vpmulhuw   ymm3, ymm3, ymmword ptr [YuvConstants + KYTORGB]          \
-    __asm vpaddsw    ymm0, ymm0, ymm3           /* B += Y */                   \
-    __asm vpaddsw    ymm1, ymm1, ymm3           /* G += Y */                   \
-    __asm vpaddsw    ymm2, ymm2, ymm3           /* R += Y */                   \
+    __asm vpermq     ymm4, ymm4, 0xd8                                          \
+    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
+    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
+    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
     __asm vpsraw     ymm0, ymm0, 6                                             \
     __asm vpsraw     ymm1, ymm1, 6                                             \
     __asm vpsraw     ymm2, ymm2, 6                                             \
@@ -2286,6 +2292,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
     __asm lea        esi,  [esi + 8]                                           \
     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm lea        eax, [eax + 8]                                            \
   }
 
 // Read 4 UV from 422, upsample to 8 UV.
@@ -2295,6 +2303,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm lea        esi,  [esi + 4]                                           \
     __asm punpcklbw  xmm0, xmm1           /* UV */                             \
     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm lea        eax, [eax + 8]                                            \
   }
 
 // Read 2 UV from 411, upsample to 8 UV.
@@ -2305,6 +2315,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm punpcklbw  xmm0, xmm1            /* UV */                            \
     __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
     __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm lea        eax, [eax + 8]                                            \
   }
 
 // Read 4 UV from NV12, upsample to 8 UV.
@@ -2312,6 +2324,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
     __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax]                                     \
+    __asm lea        eax, [eax + 8]                                            \
   }
 
 // Convert 8 pixels: 8 UV and 8 Y.
@@ -2328,13 +2342,11 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm movdqa     xmm2, xmmword ptr [YuvConstants + KUVBIASR]               \
     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
     __asm psubw      xmm2, xmm3                                                \
-    __asm movq       xmm3, qword ptr [eax]                                     \
-    __asm lea        eax, [eax + 8]                                            \
-    __asm punpcklbw  xmm3, xmm3                                                \
-    __asm pmulhuw    xmm3, xmmword ptr [YuvConstants + KYTORGB]                \
-    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
+    __asm punpcklbw  xmm4, xmm4                                                \
+    __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
+    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
     __asm psraw      xmm0, 6                                                   \
     __asm psraw      xmm1, 6                                                   \
     __asm psraw      xmm2, 6                                                   \