diff --git a/README.chromium b/README.chromium index 8d7cf524e..440dac6a1 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 281 +Version: 282 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index a7d38c1fc..671fb0384 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -275,6 +275,20 @@ int MJPGToARGB(const uint8* sample, int w, int h, int dw, int dh); +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height); + +// Box blur of an ARGB image. Returns 0 on success, -1 on bad arguments. +// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned +// to 16 byte boundary. radius is clamped to (width - 1) / 2. +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 4c36aedb3..cba66fc71 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 281 +#define LIBYUV_VERSION 282 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 1100a1d17..6383552eb 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1676,6 +1676,100 @@ int MJPGToARGB(const uint8* sample, } #endif +// Computes table of cumulative sum for image where the value is the sum +// of all values above and to the left of the entry. Used by ARGBBlur. +// Returns 0 on success, -1 on invalid arguments. 
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height) { + if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { + return -1; + } + void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; +#if defined(HAS_COMPUTECUMULATIVESUMROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; + } +#endif + memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 ints per pixel. + int32* previous_cumsum = dst_cumsum; // Row 0 sums onto its own zeroed row. + for (int y = 0; y < height; ++y) { + ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); + previous_cumsum = dst_cumsum; + dst_cumsum += dst_stride32_cumsum; + src_argb += src_stride_argb; + } + return 0; +} + +// Box blur of an ARGB image using a cumulative sum table. +// Caller should allocate cumsum table of width * height * 16 bytes aligned +// to 16 byte boundary. Returns 0 on success, -1 on invalid arguments. +int ARGBBlur(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int32* dst_cumsum, int dst_stride32_cumsum, + int width, int height, int radius) { + // height <= 1 gives a zero row span (bot_y == top_y), hence a zero area. + if (!src_argb || !dst_argb || !dst_cumsum || width <= 0 || height <= 1) { + return -1; + } + // The widest box reads column 2 * radius, so clamp radius to fit the row. + if (radius > (width - 1) / 2) { + radius = (width - 1) / 2; + } + if (radius <= 0) { + return -1; + } + void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C; +#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + CumulativeSumToAverage = CumulativeSumToAverage_SSE2; + } +#endif + + if (ARGBComputeCumulativeSum(src_argb, src_stride_argb, + dst_cumsum, dst_stride32_cumsum, + width, height) != 0) { + return -1; + } + + for (int y = 0; y < height; ++y) { + int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; + int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1); + int32* cumsum_top_row = &dst_cumsum[top_y * dst_stride32_cumsum]; + int32* cumsum_bot_row = &dst_cumsum[bot_y * dst_stride32_cumsum]; + + // Left clipped: box grows by one column per pixel. 
+ int area = radius * (bot_y - top_y); + int boxwidth = radius * 4; + int x; + for (x = 0; x < radius + 1; ++x) { + CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); + area += (bot_y - top_y); + boxwidth += 4; + } + + // Middle unclipped: full width box, n pixels in one call. + int n = (width - 1) - radius - x + 1; + CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); + + // Right clipped: box shrinks by one column per pixel. + for (x += n; x <= width - 1; ++x) { + area -= (bot_y - top_y); + boxwidth -= 4; + CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); + } + dst_argb += dst_stride_argb; + } + return 0; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row.h b/source/row.h index 0254f92ad..58e9b420a 100644 --- a/source/row.h +++ b/source/row.h @@ -76,6 +76,8 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGE_SSE2 #endif // The following are disabled when SSSE3 is available: @@ -105,6 +107,7 @@ typedef __declspec(align(16)) int8 vec8[16]; typedef __declspec(align(16)) uint8 uvec8[16]; typedef __declspec(align(16)) int16 vec16[8]; typedef __declspec(align(16)) uint16 uvec16[8]; +typedef __declspec(align(16)) int32 vec32[4]; typedef __declspec(align(16)) uint32 uvec32[4]; #else // __GNUC__ #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) @@ -112,6 +115,7 @@ typedef int8 __attribute__((vector_size(16))) vec8; typedef uint8 __attribute__((vector_size(16))) uvec8; typedef int16 __attribute__((vector_size(16))) vec16; typedef uint16 __attribute__((vector_size(16))) uvec16; +typedef int32 __attribute__((vector_size(16))) vec32; typedef uint32 __attribute__((vector_size(16))) uvec32; #endif @@ -485,6 +489,17 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width); void ARGBSepiaRow_C(uint8* 
dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); +// Used for blur. +void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + int32* previous_cumsum, int width); + +void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count); +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + int32* previous_cumsum, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index bcc36ef2c..cf5edd4f5 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -950,6 +950,35 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } +void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, + int32* previous_cumsum, int width) { + int32 row_sum[4] = {0, 0, 0, 0}; // Running per channel sum of this row. + for (int x = 0; x < width; ++x) { + row_sum[0] += row[x * 4 + 0]; + row_sum[1] += row[x * 4 + 1]; + row_sum[2] += row[x * 4 + 2]; + row_sum[3] += row[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + } +} + +void CumulativeSumToAverage_C(const int32* tl, const int32* bl, + int w, int area, uint8* dst, int count) { + float ooa = 1.0f / area; // Reciprocal of the box area. + for (int i = 0; i < count; ++i) { + dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst += 4; + tl += 4; + bl += 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git 
a/source/row_posix.cc b/source/row_posix.cc index 6b4af0855..a1d499abc 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2932,6 +2932,177 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { } #endif // HAS_ARGBSEPIAROW_SSSE3 +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the left of the value, inclusive of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + int32* previous_cumsum, int width) { + asm volatile ( + "sub %1,%2 \n" // %2 = previous_cumsum - cumsum. + "pxor %%xmm0,%%xmm0 \n" // xmm0 = running row sum. + "pxor %%xmm1,%%xmm1 \n" // xmm1 = zero for unpacking. + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" // Unaligned cumsum uses the 1 pixel loop. + "jne 49f \n" + + // 4 pixel loop \n" + ".p2align 2 \n" + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqa (%1,%2,1),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqa 0x10(%1,%2,1),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqa 0x20(%1,%2,1),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqa 0x30(%1,%2,1),%%xmm5 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqa %%xmm2,(%1) \n" + "movdqa %%xmm3,0x10(%1) \n" + "movdqa %%xmm4,0x20(%1) \n" + "movdqa %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + ".p2align 2 \n" + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" // Zero extend with xmm1; xmm4 is not zero. + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%1,%2,1),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + 
"+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + +#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 +void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) { + asm volatile ( + "movd %5,%%xmm4 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "rcpss %%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop \n" + ".p2align 2 \n" + "40: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "psubd (%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd (%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop \n" + ".p2align 2 \n" + "10: \n" + "movdqa (%0),%%xmm0 \n" + "psubd (%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd (%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq 
%%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"(static_cast<intptr_t>(width)), // %4 + "rm"(area) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2 + + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 3c1ac42cd..2fd6f5c02 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -3011,6 +3011,197 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { } } #endif // HAS_ARGBSEPIAROW_SSSE3 + +#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 +// Consider float CumulativeSum. +// Consider calling CumulativeSum one row at a time as needed. +// Consider circular CumulativeSum buffer of radius * 2 + 1 height. +// Convert cumulative sum for an area to an average for 1 pixel. +// topleft is pointer to top left of CumulativeSum buffer for area. +// botleft is pointer to bottom left of CumulativeSum buffer. +// width is offset from left to right of area in CumulativeSum buffer measured +// in number of ints. +// area is the number of pixels in the area being averaged. +// dst points to pixel to store result to. +// count is number of averaged pixels to produce. +// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte +// aligned. 
+void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, + int width, int area, uint8* dst, int count) { + __asm { + mov eax, topleft // eax topleft + mov esi, botleft // esi botleft + mov edx, width // edx box width in ints, scaled by 4 bytes below + movd xmm4, area + mov edi, dst // edi dst + mov ecx, count // ecx pixels left to produce + cvtdq2ps xmm4, xmm4 + rcpss xmm4, xmm4 // 1.0f / area + pshufd xmm4, xmm4, 0 // copy 1.0f / area to all 4 lanes + sub ecx, 4 + jl l4b // fewer than 4 pixels: go to 1 pixel loop + + // 4 pixel loop + align 4 + l4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm1, xmm1 + mulps xmm0, xmm4 + mulps xmm1, xmm4 + cvtdq2ps xmm2, xmm2 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + cvtps2dq xmm0, xmm0 // float back to int32 + cvtps2dq xmm1, xmm1 + cvtps2dq xmm2, xmm2 + cvtps2dq xmm3, xmm3 + packssdw xmm0, xmm1 // int32 to int16 with signed saturation + packssdw xmm2, xmm3 + packuswb xmm0, xmm2 // int16 to uint8 with unsigned saturation + movdqu [edi], xmm0 // store 4 ARGB pixels + lea edi, [edi + 16] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 // ecx = remaining pixels - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movdqa xmm0, [eax] + psubd xmm0, [eax + edx * 4] + lea eax, [eax + 16] + psubd xmm0, [esi] + paddd xmm0, [esi + edx * 4] + lea esi, [esi + 16] + cvtdq2ps xmm0, xmm0 + mulps xmm0, xmm4 + cvtps2dq xmm0, xmm0 + packssdw xmm0, xmm0 + packuswb xmm0, xmm0 + movd dword ptr [edi], xmm0 // store 1 ARGB pixel + lea edi, [edi + 4] + sub ecx, 1 + jge l1 + l1b: + } +} +#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2 + +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 +// Creates a table of cumulative sums where each value is a sum of all values +// above and to the 
left of the value, inclusive of the value. +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, + int32* previous_cumsum, int width) { + __asm { + mov eax, row + mov edx, cumsum + mov esi, previous_cumsum + mov ecx, width + sub esi, edx // esi = previous_cumsum - cumsum + pxor xmm0, xmm0 // xmm0 = running row sum + pxor xmm1, xmm1 // xmm1 = zero for unpacking + + sub ecx, 4 + jl l4b + test edx, 15 // unaligned cumsum uses the 1 pixel loop + jne l4b + + // 4 pixel loop + align 4 + l4: + movdqu xmm2, [eax] // 4 argb pixels 16 bytes. + lea eax, [eax + 16] + movdqa xmm4, xmm2 + + punpcklbw xmm2, xmm1 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm1 + punpckhwd xmm3, xmm1 + + punpckhbw xmm4, xmm1 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm1 + punpckhwd xmm5, xmm1 + + paddd xmm0, xmm2 + movdqa xmm2, [edx + esi] // previous row above. + paddd xmm2, xmm0 + + paddd xmm0, xmm3 + movdqa xmm3, [edx + esi + 16] + paddd xmm3, xmm0 + + paddd xmm0, xmm4 + movdqa xmm4, [edx + esi + 32] + paddd xmm4, xmm0 + + paddd xmm0, xmm5 + movdqa xmm5, [edx + esi + 48] + paddd xmm5, xmm0 + + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + movdqa [edx + 32], xmm4 + movdqa [edx + 48], xmm5 + + lea edx, [edx + 64] + sub ecx, 4 + jge l4 + + l4b: + add ecx, 4 - 1 + jl l1b + + // 1 pixel loop + align 4 + l1: + movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 
+ lea eax, [eax + 4] + punpcklbw xmm2, xmm1 // zero extend with xmm1; xmm4 is not zero + punpcklwd xmm2, xmm1 + paddd xmm0, xmm2 + movdqu xmm2, [edx + esi] + paddd xmm2, xmm0 + movdqu [edx], xmm2 + lea edx, [edx + 16] + sub ecx, 1 + jge l1 + + l1b: + } +} +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 + #endif // _M_IX86 diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index b327bdd95..cb863cca0 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -353,30 +353,30 @@ TEST_F(libyuvTest, TestAttenuate) { EXPECT_EQ(255, atten_pixels[255][3]); } -TEST_F(libyuvTest, TestAddRow) { - SIMD_ALIGNED(uint8 orig_pixels[256]); - SIMD_ALIGNED(uint16 added_pixels[256]); +TEST_F(libyuvTest, TestARGBComputeCumulativeSum) { + SIMD_ALIGNED(uint8 orig_pixels[16][16][4]); + SIMD_ALIGNED(int32 added_pixels[16][16][4]); - libyuv::AddRow AddRow = GetAddRow(added_pixels, 256); - libyuv::AddRow SubRow = GetSubRow(added_pixels, 256); - - for (int i = 0; i < 256; ++i) { - orig_pixels[i] = i; + for (int y = 0; y < 16; ++y) { + for (int x = 0; x < 16; ++x) { + orig_pixels[y][x][0] = 1u; + orig_pixels[y][x][1] = 2u; + orig_pixels[y][x][2] = 3u; + orig_pixels[y][x][3] = 255u; + } } - memset(added_pixels, 0, sizeof(uint16) * 256); - AddRow(orig_pixels, added_pixels, 256); - EXPECT_EQ(7u, added_pixels[7]); - EXPECT_EQ(250u, added_pixels[250]); - AddRow(orig_pixels, added_pixels, 256); - EXPECT_EQ(14u, added_pixels[7]); - EXPECT_EQ(500u, added_pixels[250]); - SubRow(orig_pixels, added_pixels, 256); - EXPECT_EQ(7u, added_pixels[7]); - EXPECT_EQ(250u, added_pixels[250]); + ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4, + &added_pixels[0][0][0], 16 * 4, + 16, 16); // NOTE(review): width 16 never reaches the 1 pixel loop. - for (int i = 0; i < 1000 * (1280 * 720 * 4 / 256); ++i) { - AddRow(orig_pixels, added_pixels, 256); + for (int y = 0; y < 16; ++y) { + for (int x = 0; x < 16; ++x) { + EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]); + EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]); + EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]); + 
EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]); + } } }