Blur functions

BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/633005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@282 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2012-06-10 02:40:04 +00:00 · 2012-06-10 02:40:04 +00:00 · f51e87912e
commit f51e87912e
parent 2d9fe08225
9 changed files with 522 additions and 22 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 281
+Version: 282
 License: BSD
 License File: LICENSE

--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@ -275,6 +275,20 @@ int MJPGToARGB(const uint8* sample,
               int w, int h,
               int dw, int dh);

+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry.  Used by ARGBBlur.
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height);
+
+// Blur ARGB image.
+// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned
+// to 16 byte boundary.
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 281
+#define LIBYUV_VERSION 282

 #endif  // INCLUDE_LIBYUV_VERSION_H_

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -1676,6 +1676,86 @@ int MJPGToARGB(const uint8* sample,
 }
 #endif

+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry.  Used by ARGBBlur.
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height) {
+  if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+    return -1;
+  }
+  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+      int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+  }
+#endif
+  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 ints per pixel.
+  int32* previous_cumsum = dst_cumsum;
+  for (int y = 0; y < height; ++y) {
+    ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
+    previous_cumsum = dst_cumsum;
+    dst_cumsum += dst_stride32_cumsum;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Blur ARGB image.
+// Caller should allocate cumsum table of width * height * 16 bytes aligned
+// to 16 byte boundary.
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius) {
+  void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
+      int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
+  }
+#endif
+
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+                           dst_cumsum, dst_stride32_cumsum,
+                           width, height);
+
+  for (int y = 0; y < height; ++y) {
+    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+    int32* cumsum_top_row = &dst_cumsum[top_y * dst_stride32_cumsum];
+    int32* cumsum_bot_row = &dst_cumsum[bot_y * dst_stride32_cumsum];
+
+    // Left clipped.
+    int area = radius * (bot_y - top_y);
+    int boxwidth = radius * 4;
+    int x;
+    for (x = 0; x < radius + 1; ++x) {
+      CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
+                              boxwidth, area, &dst_argb[x * 4], 1);
+      area += (bot_y - top_y);
+      boxwidth += 4;
+    }
+
+    // Middle unclipped.
+    int n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
+                           boxwidth, area, &dst_argb[x * 4], n);
+
+    // Right clipped.
+    for (x += n; x <= width - 1; ++x) {
+      area -= (bot_y - top_y);
+      boxwidth -= 4;
+      CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4,
+                             cumsum_bot_row + (x - radius - 1) * 4,
+                             boxwidth, area, &dst_argb[x * 4], 1);
+    }
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row.h
+++ b/source/row.h
@ -76,6 +76,8 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
 #endif

 // The following are disabled when SSSE3 is available:
@ -105,6 +107,7 @@ typedef __declspec(align(16)) int8 vec8[16];
 typedef __declspec(align(16)) uint8 uvec8[16];
 typedef __declspec(align(16)) int16 vec16[8];
 typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
 typedef __declspec(align(16)) uint32 uvec32[4];
 #else  // __GNUC__
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
@ -112,6 +115,7 @@ typedef int8 __attribute__((vector_size(16))) vec8;
 typedef uint8 __attribute__((vector_size(16))) uvec8;
 typedef int16 __attribute__((vector_size(16))) vec16;
 typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
 typedef uint32 __attribute__((vector_size(16))) uvec32;
 #endif

@ -485,6 +489,17 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width);
 void ARGBSepiaRow_C(uint8* dst_argb, int width);
 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);

+// Used for blur.
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+                                 int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  int32* previous_cumsum, int width);
+
+void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft,
+                              int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               int32* previous_cumsum, int width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -950,6 +950,35 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
  }
 }

+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               int32* previous_cumsum, int width) {
+  int32 row_sum[4] = {0, 0, 0, 0};
+  for (int x = 0; x < width; ++x) {
+    row_sum[0] += row[x * 4 + 0];
+    row_sum[1] += row[x * 4 + 1];
+    row_sum[2] += row[x * 4 + 2];
+    row_sum[3] += row[x * 4 + 3];
+    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
+    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
+    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
+    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
+  }
+}
+
+void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
+                              int w, int area, uint8* dst, int count) {
+  float ooa = 1.0f / area;
+  for (int i = 0; i < count; ++i) {
+    dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+    dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+    dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+    dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+    dst += 4;
+    tl += 4;
+    bl += 4;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -2932,6 +2932,177 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3

+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  int32* previous_cumsum, int width) {
+  asm volatile (
+    "sub       %1,%2                           \n"
+    "pxor      %%xmm0,%%xmm0                   \n"
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"
+    "test      $0xf,%1                         \n"
+    "jne       49f                             \n"
+
+  // 4 pixel loop                              \n"
+    ".p2align  2                               \n"
+  "40:                                         \n"
+    "movdqu    (%0),%%xmm2                     \n"
+    "lea       0x10(%0),%0                     \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"
+    "punpckhwd %%xmm1,%%xmm3                   \n"
+    "punpckhbw %%xmm1,%%xmm4                   \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "punpcklwd %%xmm1,%%xmm4                   \n"
+    "punpckhwd %%xmm1,%%xmm5                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "movdqa    (%1,%2,1),%%xmm2                \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"
+    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
+    "paddd     %%xmm0,%%xmm3                   \n"
+    "paddd     %%xmm4,%%xmm0                   \n"
+    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
+    "paddd     %%xmm0,%%xmm4                   \n"
+    "paddd     %%xmm5,%%xmm0                   \n"
+    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
+    "paddd     %%xmm0,%%xmm5                   \n"
+    "movdqa    %%xmm2,(%1)                     \n"
+    "movdqa    %%xmm3,0x10(%1)                 \n"
+    "movdqa    %%xmm4,0x20(%1)                 \n"
+    "movdqa    %%xmm5,0x30(%1)                 \n"
+    "lea       0x40(%1),%1                     \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        19f                             \n"
+
+  // 1 pixel loop                              \n"
+    ".p2align  2                               \n"
+  "10:                                         \n"
+    "movd      (%0),%%xmm2                     \n"
+    "lea       0x4(%0),%0                      \n"
+    "punpcklbw %%xmm4,%%xmm2                   \n"
+    "punpcklwd %%xmm4,%%xmm2                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "movdqu    (%1,%2,1),%%xmm2                \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "movdqu    %%xmm2,(%1)                     \n"
+    "lea       0x10(%1),%1                     \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+
+  "19:                                         \n"
+  : "+r"(row),  // %0
+    "+r"(cumsum),  // %1
+    "+r"(previous_cumsum),  // %2
+    "+r"(width)  // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+                                 int width, int area, uint8* dst, int count) {
+  asm volatile (
+    "movd      %5,%%xmm4                       \n"
+    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
+    "rcpss     %%xmm4,%%xmm4                   \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"
+
+  // 4 pixel loop                              \n"
+    ".p2align  2                               \n"
+  "40:                                         \n"
+    "movdqa    (%0),%%xmm0                     \n"
+    "movdqa    0x10(%0),%%xmm1                 \n"
+    "movdqa    0x20(%0),%%xmm2                 \n"
+    "movdqa    0x30(%0),%%xmm3                 \n"
+    "psubd     (%0,%4,4),%%xmm0                \n"
+    "psubd     0x10(%0,%4,4),%%xmm1            \n"
+    "psubd     0x20(%0,%4,4),%%xmm2            \n"
+    "psubd     0x30(%0,%4,4),%%xmm3            \n"
+    "lea       0x40(%0),%0                     \n"
+    "psubd     (%1),%%xmm0                     \n"
+    "psubd     0x10(%1),%%xmm1                 \n"
+    "psubd     0x20(%1),%%xmm2                 \n"
+    "psubd     0x30(%1),%%xmm3                 \n"
+    "paddd     (%1,%4,4),%%xmm0                \n"
+    "paddd     0x10(%1,%4,4),%%xmm1            \n"
+    "paddd     0x20(%1,%4,4),%%xmm2            \n"
+    "paddd     0x30(%1,%4,4),%%xmm3            \n"
+    "lea       0x40(%1),%1                     \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm1                   \n"
+    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
+    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
+    "mulps     %%xmm4,%%xmm2                   \n"
+    "mulps     %%xmm4,%%xmm3                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "cvtps2dq  %%xmm1,%%xmm1                   \n"
+    "cvtps2dq  %%xmm2,%%xmm2                   \n"
+    "cvtps2dq  %%xmm3,%%xmm3                   \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0,(%2)                     \n"
+    "lea       0x10(%2),%2                     \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        19f                             \n"
+
+  // 1 pixel loop                              \n"
+    ".p2align  2                               \n"
+  "10:                                         \n"
+    "movdqa    (%0),%%xmm0                     \n"
+    "psubd     (%0,%4,4),%%xmm0                \n"
+    "lea       0x10(%0),%0                     \n"
+    "psubd     (%1),%%xmm0                     \n"
+    "paddd     (%1,%4,4),%%xmm0                \n"
+    "lea       0x10(%1),%1                     \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,(%2)                     \n"
+    "lea       0x4(%2),%2                      \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(topleft),  // %0
+    "+r"(botleft),  // %1
+    "+r"(dst),      // %2
+    "+rm"(count)    // %3
+  : "r"(static_cast<intptr_t>(width)),  // %4
+    "rm"(area)     // %5
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
+
+
 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -3011,6 +3011,197 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  }
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+//   in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
+// aligned.
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
+                                 int width, int area, uint8* dst, int count) {
+  __asm {
+    mov        eax, topleft  // eax topleft
+    mov        esi, botleft  // esi botleft
+    mov        edx, width
+    movd       xmm4, area
+    mov        edi, dst
+    mov        ecx, count
+    cvtdq2ps   xmm4, xmm4
+    rcpss      xmm4, xmm4  // 1.0f / area
+    pshufd     xmm4, xmm4, 0
+    sub        ecx, 4
+    jl         l4b
+
+    // 4 pixel loop
+    align      4
+  l4:
+    // top left
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm1, xmm1
+    mulps      xmm0, xmm4
+    mulps      xmm1, xmm4
+    cvtdq2ps   xmm2, xmm2
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
+    cvtps2dq   xmm0, xmm0
+    cvtps2dq   xmm1, xmm1
+    cvtps2dq   xmm2, xmm2
+    cvtps2dq   xmm3, xmm3
+    packssdw   xmm0, xmm1
+    packssdw   xmm2, xmm3
+    packuswb   xmm0, xmm2
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1
+    jl         l1b
+
+    // 1 pixel loop
+    align      4
+  l1:
+    movdqa     xmm0, [eax]
+    psubd      xmm0, [eax + edx * 4]
+    lea        eax, [eax + 16]
+    psubd      xmm0, [esi]
+    paddd      xmm0, [esi + edx * 4]
+    lea        esi, [esi + 16]
+    cvtdq2ps   xmm0, xmm0
+    mulps      xmm0, xmm4
+    cvtps2dq   xmm0, xmm0
+    packssdw   xmm0, xmm0
+    packuswb   xmm0, xmm0
+    movd       dword ptr [edi], xmm0
+    lea        edi, [edi + 4]
+    sub        ecx, 1
+    jge        l1
+  l1b:
+  }
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  int32* previous_cumsum, int width) {
+  __asm {
+    mov        eax, row
+    mov        edx, cumsum
+    mov        esi, previous_cumsum
+    mov        ecx, width
+    sub        esi, edx
+    pxor       xmm0, xmm0
+    pxor       xmm1, xmm1
+
+    sub        ecx, 4
+    jl         l4b
+    test       edx, 15
+    jne        l4b
+
+    // 4 pixel loop
+    align      4
+  l4:
+    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
+    lea        eax, [eax + 16]
+    movdqa     xmm4, xmm2
+
+    punpcklbw  xmm2, xmm1
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm1
+    punpckhwd  xmm3, xmm1
+
+    punpckhbw  xmm4, xmm1
+    movdqa     xmm5, xmm4
+    punpcklwd  xmm4, xmm1
+    punpckhwd  xmm5, xmm1
+
+    paddd      xmm0, xmm2
+    movdqa     xmm2, [edx + esi]  // previous row above.
+    paddd      xmm2, xmm0
+
+    paddd      xmm0, xmm3
+    movdqa     xmm3, [edx + esi + 16]
+    paddd      xmm3, xmm0
+
+    paddd      xmm0, xmm4
+    movdqa     xmm4, [edx + esi + 32]
+    paddd      xmm4, xmm0
+
+    paddd      xmm0, xmm5
+    movdqa     xmm5, [edx + esi + 48]
+    paddd      xmm5, xmm0
+
+    movdqa     [edx], xmm2
+    movdqa     [edx + 16], xmm3
+    movdqa     [edx + 32], xmm4
+    movdqa     [edx + 48], xmm5
+
+    lea        edx, [edx + 64]
+    sub        ecx, 4
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1
+    jl         l1b
+
+    // 1 pixel loop
+    align      4
+  l1:
+    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
+    lea        eax, [eax + 4]
+    punpcklbw  xmm2, xmm4
+    punpcklwd  xmm2, xmm4
+    paddd      xmm0, xmm2
+    movdqu     xmm2, [edx + esi]
+    paddd      xmm2, xmm0
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 1
+    jge        l1
+
+ l1b:
+  }
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
 #endif  // _M_IX86


--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -353,30 +353,30 @@ TEST_F(libyuvTest, TestAttenuate) {
  EXPECT_EQ(255, atten_pixels[255][3]);
 }

-TEST_F(libyuvTest, TestAddRow) {
-  SIMD_ALIGNED(uint8 orig_pixels[256]);
-  SIMD_ALIGNED(uint16 added_pixels[256]);
+TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
+  SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
+  SIMD_ALIGNED(int32 added_pixels[16][16][4]);

-  libyuv::AddRow AddRow = GetAddRow(added_pixels, 256);
-  libyuv::AddRow SubRow = GetSubRow(added_pixels, 256);
-
-  for (int i = 0; i < 256; ++i) {
-    orig_pixels[i] = i;
+  for (int y = 0; y < 16; ++y) {
+    for (int x = 0; x < 16; ++x) {
+      orig_pixels[y][x][0] = 1u;
+      orig_pixels[y][x][1] = 2u;
+      orig_pixels[y][x][2] = 3u;
+      orig_pixels[y][x][3] = 255u;
+    }
  }
-  memset(added_pixels, 0, sizeof(uint16) * 256);

-  AddRow(orig_pixels, added_pixels, 256);
-  EXPECT_EQ(7u, added_pixels[7]);
-  EXPECT_EQ(250u, added_pixels[250]);
-  AddRow(orig_pixels, added_pixels, 256);
-  EXPECT_EQ(14u, added_pixels[7]);
-  EXPECT_EQ(500u, added_pixels[250]);
-  SubRow(orig_pixels, added_pixels, 256);
-  EXPECT_EQ(7u, added_pixels[7]);
-  EXPECT_EQ(250u, added_pixels[250]);
+  ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
+                           &added_pixels[0][0][0], 16 * 4,
+                           16, 16);

-  for (int i = 0; i < 1000 * (1280 * 720 * 4 / 256); ++i) {
-    AddRow(orig_pixels, added_pixels, 256);
+  for (int y = 0; y < 16; ++y) {
+    for (int x = 0; x < 16; ++x) {
+      EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
+      EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
+      EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
+      EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
+    }
  }
 }