From c99db063e24d6180740d4adc29e84159096eef2d Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Tue, 1 Oct 2013 01:27:30 +0000
Subject: [PATCH] Change ARGBColorMatrix to a 4x4. BUG=none TEST=planar_unitest
 updates R=johannkoenig@google.com, ryanpetrie@google.com,
 thorcarpenter@google.com

Review URL: https://webrtc-codereview.appspot.com/2320008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@805 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium                   |  2 +-
 include/libyuv/planar_functions.h | 21 ++++++-
 include/libyuv/row.h              | 11 ++--
 include/libyuv/version.h          |  2 +-
 source/planar_functions.cc        | 67 +++++++++++++++++-----
 source/row_common.cc              | 22 +++++---
 source/row_neon.cc                | 40 ++++++++-----
 source/row_posix.cc               | 83 ++++++++++++++-------------
 source/row_win.cc                 | 73 +++++++++++++-----------
 unit_test/planar_test.cc          | 94 ++++++++++++++++++++++++++++---
 10 files changed, 289 insertions(+), 126 deletions(-)

diff --git a/README.chromium b/README.chromium
index 08e3028e7..7eeb253f8 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 804
+Version: 805
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 31490012c..7ca6e7fd8 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -186,15 +186,32 @@ LIBYUV_API
 int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
               int x, int y, int width, int height);
 
+// Deprecated.  Use ARGBColorMatrix instead.
 // Apply a matrix rotation to each ARGB pixel.
 // matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
 // The first 4 coefficients apply to B, G, R, A and produce B of the output.
 // The next 4 coefficients apply to B, G, R, A and produce G of the output.
 // The last 4 coefficients apply to B, G, R, A and produce R of the output.
 LIBYUV_API
-int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int x, int y, int width, int height);
+
+// Temporary API mapper.
+// #define ARGBColorMatrix(d, s, m, x, y, w, h) \
+//    RGBColorMatrix(d, s, m, x, y, w, h)
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
                     const int8* matrix_argb,
-                    int x, int y, int width, int height);
+                    int width, int height);
 
 // Apply a color table each ARGB pixel.
 // Table contains 256 ARGB values.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 2e78ba0b7..d2a23410b 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -1456,11 +1456,12 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width);
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
 void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
 
-void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
-                              int width);
-void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
-                             int width);
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width);
 
 void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index a451d05d8..92fd1129b 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 804
+#define LIBYUV_VERSION 805
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 594641d8b..ee0487a73 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1345,24 +1345,31 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
   return 0;
 }
 
-// Apply a 4x3 matrix rotation to each ARGB pixel.
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
 LIBYUV_API
-int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
                     const int8* matrix_argb,
-                    int dst_x, int dst_y, int width, int height) {
-  if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+                    int width, int height) {
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
     return -1;
   }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
   // Coalesce contiguous rows.
-  if (dst_stride_argb == width * 4) {
-    return ARGBColorMatrix(dst_argb, dst_stride_argb,
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    return ARGBColorMatrix(src_argb, 0,
+                           dst_argb, 0,
                            matrix_argb,
-                           dst_x, dst_y,
                            width * height, 1);
   }
-  void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
-                             int width) = ARGBColorMatrixRow_C;
+  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
 #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -1373,14 +1380,48 @@ int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
     ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
   }
 #endif
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   for (int y = 0; y < height; ++y) {
-    ARGBColorMatrixRow(dst, matrix_argb, width);
-    dst += dst_stride_argb;
+    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
   }
   return 0;
 }
 
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  matrix_argb[0] = matrix_rgb[0] / 2;
+  matrix_argb[1] = matrix_rgb[1] / 2;
+  matrix_argb[2] = matrix_rgb[2] / 2;
+  matrix_argb[3] = matrix_rgb[3] / 2;
+  matrix_argb[4] = matrix_rgb[4] / 2;
+  matrix_argb[5] = matrix_rgb[5] / 2;
+  matrix_argb[6] = matrix_rgb[6] / 2;
+  matrix_argb[7] = matrix_rgb[7] / 2;
+  matrix_argb[8] = matrix_rgb[8] / 2;
+  matrix_argb[9] = matrix_rgb[9] / 2;
+  matrix_argb[10] = matrix_rgb[10] / 2;
+  matrix_argb[11] = matrix_rgb[11] / 2;
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  return ARGBColorMatrix(const_cast<const uint8*>(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
 // Apply a color table each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
diff --git a/source/row_common.cc b/source/row_common.cc
index 3ba0cdb69..f412336e5 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -654,21 +654,27 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
 }
 
 // Apply color matrix to a row of image. Matrix is signed.
-void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width) {
   for (int x = 0; x < width; ++x) {
-    int b = dst_argb[0];
-    int g = dst_argb[1];
-    int r = dst_argb[2];
-    int a = dst_argb[3];
+    int b = src_argb[0];
+    int g = src_argb[1];
+    int r = src_argb[2];
+    int a = src_argb[3];
     int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
-              r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
     int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
-              r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
     int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
-              r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
+              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
+    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
+              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
     dst_argb[0] = Clamp(sb);
     dst_argb[1] = Clamp(sg);
     dst_argb[2] = Clamp(sr);
+    dst_argb[3] = Clamp(sa);
+    src_argb += 4;
     dst_argb += 4;
   }
 }
diff --git a/source/row_neon.cc b/source/row_neon.cc
index bf0167540..f721a94dd 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2474,18 +2474,19 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
 }
 
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
-                             int width) {
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
   asm volatile (
-    "vld1.8     {q2}, [%2]                     \n"  // load 3 ARGB vectors.
+    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
     "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
-    "vmovl.s8   q1, d5                         \n"  // R coefficients s16.
+    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
 
     ".p2align  2                               \n"
   "1:                                          \n"
-    "vld4.8     {d16, d18, d20, d22}, [%0]     \n"  // load 8 ARGB pixels.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
     "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
     "vmovl.u8   q9, d18                        \n"  // g
     "vmovl.u8   q10, d20                       \n"  // r
@@ -2493,33 +2494,42 @@ void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
     "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
     "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
     "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
     "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
     "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
     "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
     "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
     "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
     "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
     "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B
     "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G
     "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R
+    "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A
     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqshrun.s16 d16, q12, #7                  \n"  // 16 bit to 8 bit B
-    "vqshrun.s16 d18, q13, #7                  \n"  // 16 bit to 8 bit G
-    "vqshrun.s16 d20, q14, #7                  \n"  // 16 bit to 8 bit R
-    "vst4.8     {d16, d18, d20, d22}, [%0]!    \n"  // store 8 ARGB pixels.
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
     "bgt        1b                             \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(width)       // %1
-  : "r"(matrix_argb)  // %2
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9",
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
     "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 56b34ff45..839c7e370 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -4327,62 +4327,67 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
 // Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
-                              int width) {
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
   asm volatile (
-    "movd      "MEMACCESS(2)",%%xmm2           \n"
-    "movd      "MEMACCESS2(0x4,2)",%%xmm3      \n"
-    "movd      "MEMACCESS2(0x8,2)",%%xmm4      \n"
+    "movd      "MEMACCESS(3)",%%xmm2           \n"
+    "movd      "MEMACCESS2(0x4,3)",%%xmm3      \n"
+    "movd      "MEMACCESS2(0x8,3)",%%xmm4      \n"
+    "movd      "MEMACCESS2(0xc,3)",%%xmm5      \n"
     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
 
     // 8 pixel loop.
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    "MEMACCESS(0)",%%xmm0           \n"
-    "movdqa    "MEMACCESS2(0x10,0)",%%xmm6     \n"
+    "movdqa    "MEMACCESS2(0x10,0)",%%xmm7     \n"
     "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm6                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm5           \n"
-    "movdqa    "MEMACCESS2(0x10,0)",%%xmm1     \n"
-    "pmaddubsw %%xmm3,%%xmm5                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddsw   %%xmm6,%%xmm0                   \n"
-    "phaddsw   %%xmm1,%%xmm5                   \n"
-    "psraw     $0x7,%%xmm0                     \n"
-    "psraw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqa    "MEMACCESS(0)",%%xmm5           \n"
-    "movdqa    "MEMACCESS2(0x10,0)",%%xmm1     \n"
-    "pmaddubsw %%xmm4,%%xmm5                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddsw   %%xmm1,%%xmm5                   \n"
-    "psraw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
+    "pmaddubsw %%xmm2,%%xmm7                   \n"
     "movdqa    "MEMACCESS(0)",%%xmm6           \n"
     "movdqa    "MEMACCESS2(0x10,0)",%%xmm1     \n"
-    "psrld     $0x18,%%xmm6                    \n"
-    "psrld     $0x18,%%xmm1                    \n"
-    "packuswb  %%xmm1,%%xmm6                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddsw   %%xmm7,%%xmm0                   \n"
+    "phaddsw   %%xmm1,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm0                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
     "packuswb  %%xmm6,%%xmm6                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm6,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "punpckhwd %%xmm5,%%xmm1                   \n"
-    "sub       $0x8,%1                         \n"
-    "movdqa    %%xmm0,"MEMACCESS(0)"           \n"
-    "movdqa    %%xmm1,"MEMACCESS2(0x10,0)"     \n"
+    "punpcklbw %%xmm6,%%xmm0                   \n"
+    "movdqa    "MEMACCESS(0)",%%xmm1           \n"
+    "movdqa    "MEMACCESS2(0x10,0)",%%xmm7     \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm1                   \n"
+    "movdqa    "MEMACCESS(0)",%%xmm6           \n"
+    "movdqa    "MEMACCESS2(0x10,0)",%%xmm7     \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm1                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "punpcklwd %%xmm1,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm6                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "movdqa    %%xmm6,"MEMACCESS2(0x10,1)"     \n"
     "lea       "MEMLEA(0x20,0)",%0             \n"
+    "lea       "MEMLEA(0x20,1)",%1             \n"
     "jg        1b                              \n"
-  : "+r"(dst_argb),      // %0
-    "+r"(width)          // %1
-  : "r"(matrix_argb)     // %2
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb),      // %1
+    "+r"(width)          // %2
+  : "r"(matrix_argb)     // %3
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
 #endif
   );
 }
diff --git a/source/row_win.cc b/source/row_win.cc
index 316f29df6..840eb7531 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -5002,57 +5002,62 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
 __declspec(naked) __declspec(align(16))
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
-                              int width) {
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
   __asm {
-    mov        eax, [esp + 4]   /* dst_argb */
-    mov        edx, [esp + 8]   /* matrix_argb */
-    mov        ecx, [esp + 12]  /* width */
-    movd       xmm2, [edx]
-    movd       xmm3, [edx + 4]
-    movd       xmm4, [edx + 8]
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* matrix_argb */
+    movd       xmm2, [ecx]
+    movd       xmm3, [ecx + 4]
+    movd       xmm4, [ecx + 8]
+    movd       xmm5, [ecx + 12]
     pshufd     xmm2, xmm2, 0
     pshufd     xmm3, xmm3, 0
     pshufd     xmm4, xmm4, 0
+    pshufd     xmm5, xmm5, 0
+    mov        ecx, [esp + 16]  /* width */
 
     align      16
  convertloop:
     movdqa     xmm0, [eax]  // B
-    movdqa     xmm6, [eax + 16]
+    movdqa     xmm7, [eax + 16]
     pmaddubsw  xmm0, xmm2
-    pmaddubsw  xmm6, xmm2
-    movdqa     xmm5, [eax]  // G
+    pmaddubsw  xmm7, xmm2
+    movdqa     xmm6, [eax]  // G
     movdqa     xmm1, [eax + 16]
-    pmaddubsw  xmm5, xmm3
+    pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm6   // B
-    phaddsw    xmm5, xmm1   // G
-    psraw      xmm0, 7      // B
-    psraw      xmm5, 7      // G
+    phaddsw    xmm0, xmm7   // B
+    phaddsw    xmm6, xmm1   // G
+    psraw      xmm0, 6      // B
+    psraw      xmm6, 6      // G
     packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
-    movdqa     xmm5, [eax]  // R
-    movdqa     xmm1, [eax + 16]
-    pmaddubsw  xmm5, xmm4
+    packuswb   xmm6, xmm6   // 8 G values
+    punpcklbw  xmm0, xmm6   // 8 BG values
+    movdqa     xmm1, [eax]  // R
+    movdqa     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
-    phaddsw    xmm5, xmm1
-    psraw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
+    pmaddubsw  xmm7, xmm4
+    phaddsw    xmm1, xmm7   // R
     movdqa     xmm6, [eax]  // A
-    movdqa     xmm1, [eax + 16]
-    psrld      xmm6, 24
-    psrld      xmm1, 24
-    packuswb   xmm6, xmm1
+    movdqa     xmm7, [eax + 16]
+    pmaddubsw  xmm6, xmm5
+    pmaddubsw  xmm7, xmm5
+    phaddsw    xmm6, xmm7   // A
+    psraw      xmm1, 6      // R
+    psraw      xmm6, 6      // A
+    packuswb   xmm1, xmm1   // 8 R values
     packuswb   xmm6, xmm6   // 8 A values
-    movdqa     xmm1, xmm0   // Weave BG, RA together
-    punpcklbw  xmm5, xmm6   // 8 RA values
-    punpcklwd  xmm0, xmm5   // BGRA first 4
-    punpckhwd  xmm1, xmm5   // BGRA next 4
+    punpcklbw  xmm1, xmm6   // 8 RA values
+    movdqa     xmm6, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm1   // BGRA first 4
+    punpckhwd  xmm6, xmm1   // BGRA next 4
     sub        ecx, 8
-    movdqa     [eax], xmm0
-    movdqa     [eax + 16], xmm1
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm6
     lea        eax, [eax + 32]
+    lea        edx, [edx + 32]
     jg         convertloop
     ret
   }
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index c51d26ea5..009690e07 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -484,9 +484,87 @@ TEST_F(libyuvTest, TestARGBSepia) {
 
 TEST_F(libyuvTest, TestARGBColorMatrix) {
   SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
 
   // Matrix for Sepia.
-  static const int8 kARGBToSepia[] = {
+  SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
+    17 / 2, 68 / 2, 35 / 2, 0,
+    22 / 2, 88 / 2, 45 / 2, 0,
+    24 / 2, 98 / 2, 50 / 2, 0,
+    0, 0, 0, 64,  // Copy alpha.
+  };
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test color
+  orig_pixels[3][0] = 16u;
+  orig_pixels[3][1] = 64u;
+  orig_pixels[3][2] = 192u;
+  orig_pixels[3][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                  &kRGBToSepia[0], 16, 1);
+  EXPECT_EQ(31u, dst_pixels_opt[0][0]);
+  EXPECT_EQ(43u, dst_pixels_opt[0][1]);
+  EXPECT_EQ(47u, dst_pixels_opt[0][2]);
+  EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+  EXPECT_EQ(135u, dst_pixels_opt[1][0]);
+  EXPECT_EQ(175u, dst_pixels_opt[1][1]);
+  EXPECT_EQ(195u, dst_pixels_opt[1][2]);
+  EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+  EXPECT_EQ(67u, dst_pixels_opt[2][0]);
+  EXPECT_EQ(87u, dst_pixels_opt[2][1]);
+  EXPECT_EQ(99u, dst_pixels_opt[2][2]);
+  EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+  EXPECT_EQ(87u, dst_pixels_opt[3][0]);
+  EXPECT_EQ(112u, dst_pixels_opt[3][1]);
+  EXPECT_EQ(127u, dst_pixels_opt[3][2]);
+  EXPECT_EQ(224u, dst_pixels_opt[3][3]);
+
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  MaskCpuFlags(0);
+  ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+                  &kRGBToSepia[0], 1280, 1);
+  MaskCpuFlags(-1);
+
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+                    &kRGBToSepia[0], 1280, 1);
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+    EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+    EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+    EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+  }
+}
+
+TEST_F(libyuvTest, TestRGBColorMatrix) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+
+  // Matrix for Sepia.
+  SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
     17, 68, 35, 0,
     22, 88, 45, 0,
     24, 98, 50, 0,
@@ -515,8 +593,8 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
   orig_pixels[3][2] = 192u;
   orig_pixels[3][3] = 224u;
   // Do 16 to test asm version.
-  ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1);
-  EXPECT_EQ(33u, orig_pixels[0][0]);
+  RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1);
+  EXPECT_EQ(31u, orig_pixels[0][0]);
   EXPECT_EQ(43u, orig_pixels[0][1]);
   EXPECT_EQ(47u, orig_pixels[0][2]);
   EXPECT_EQ(128u, orig_pixels[0][3]);
@@ -524,12 +602,12 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
   EXPECT_EQ(175u, orig_pixels[1][1]);
   EXPECT_EQ(195u, orig_pixels[1][2]);
   EXPECT_EQ(0u, orig_pixels[1][3]);
-  EXPECT_EQ(69u, orig_pixels[2][0]);
-  EXPECT_EQ(89u, orig_pixels[2][1]);
+  EXPECT_EQ(67u, orig_pixels[2][0]);
+  EXPECT_EQ(87u, orig_pixels[2][1]);
   EXPECT_EQ(99u, orig_pixels[2][2]);
   EXPECT_EQ(255u, orig_pixels[2][3]);
-  EXPECT_EQ(88u, orig_pixels[3][0]);
-  EXPECT_EQ(114u, orig_pixels[3][1]);
+  EXPECT_EQ(87u, orig_pixels[3][0]);
+  EXPECT_EQ(112u, orig_pixels[3][1]);
   EXPECT_EQ(127u, orig_pixels[3][2]);
   EXPECT_EQ(224u, orig_pixels[3][3]);
 
@@ -540,7 +618,7 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
     orig_pixels[i][3] = i;
   }
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 1280, 1);
+    RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1);
   }
 }