mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Blur functions
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/633005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@282 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
2d9fe08225
commit
f51e87912e
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 281
|
||||
Version: 282
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -275,6 +275,20 @@ int MJPGToARGB(const uint8* sample,
|
||||
int w, int h,
|
||||
int dw, int dh);
|
||||
|
||||
// Computes a table of cumulative sums for the image, where each entry is the
// sum of all values above and to the left of it, inclusive (4 int32 sums per
// ARGB pixel). Used by ARGBBlur.
|
||||
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
|
||||
int32* dst_cumsum, int dst_stride32_cumsum,
|
||||
int width, int height);
|
||||
|
||||
// Blur ARGB image.
|
||||
// Caller must allocate the dst_cumsum table: width * height * 16 bytes,
// aligned to a 16 byte boundary.
|
||||
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int32* dst_cumsum, int dst_stride32_cumsum,
|
||||
int width, int height, int radius);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 281
|
||||
#define LIBYUV_VERSION 282
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
|
||||
@ -1676,6 +1676,86 @@ int MJPGToARGB(const uint8* sample,
|
||||
}
|
||||
#endif
|
||||
|
||||
// Computes table of cumulative sum for image where the value is the sum
|
||||
// of all values above and to the left of the entry. Used by ARGBBlur.
|
||||
int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
|
||||
int32* dst_cumsum, int dst_stride32_cumsum,
|
||||
int width, int height) {
|
||||
if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
|
||||
return -1;
|
||||
}
|
||||
void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
|
||||
int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
|
||||
#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
|
||||
}
|
||||
#endif
|
||||
memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 ints per pixel.
|
||||
int32* previous_cumsum = dst_cumsum;
|
||||
for (int y = 0; y < height; ++y) {
|
||||
ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
|
||||
previous_cumsum = dst_cumsum;
|
||||
dst_cumsum += dst_stride32_cumsum;
|
||||
src_argb += src_stride_argb;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Blur ARGB image.
|
||||
// Caller should allocate cumsum table of width * height * 16 bytes aligned
|
||||
// to 16 byte boundary.
|
||||
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int32* dst_cumsum, int dst_stride32_cumsum,
|
||||
int width, int height, int radius) {
|
||||
void (*CumulativeSumToAverage)(const int32* topleft, const int32* botleft,
|
||||
int width, int area, uint8* dst, int count) = CumulativeSumToAverage_C;
|
||||
#if defined(HAS_CUMULATIVESUMTOAVERAGE_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CumulativeSumToAverage = CumulativeSumToAverage_SSE2;
|
||||
}
|
||||
#endif
|
||||
|
||||
ARGBComputeCumulativeSum(src_argb, src_stride_argb,
|
||||
dst_cumsum, dst_stride32_cumsum,
|
||||
width, height);
|
||||
|
||||
for (int y = 0; y < height; ++y) {
|
||||
int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
|
||||
int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
|
||||
int32* cumsum_top_row = &dst_cumsum[top_y * dst_stride32_cumsum];
|
||||
int32* cumsum_bot_row = &dst_cumsum[bot_y * dst_stride32_cumsum];
|
||||
|
||||
// Left clipped.
|
||||
int area = radius * (bot_y - top_y);
|
||||
int boxwidth = radius * 4;
|
||||
int x;
|
||||
for (x = 0; x < radius + 1; ++x) {
|
||||
CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
|
||||
boxwidth, area, &dst_argb[x * 4], 1);
|
||||
area += (bot_y - top_y);
|
||||
boxwidth += 4;
|
||||
}
|
||||
|
||||
// Middle unclipped.
|
||||
int n = (width - 1) - radius - x + 1;
|
||||
CumulativeSumToAverage(cumsum_top_row, cumsum_bot_row,
|
||||
boxwidth, area, &dst_argb[x * 4], n);
|
||||
|
||||
// Right clipped.
|
||||
for (x += n; x <= width - 1; ++x) {
|
||||
area -= (bot_y - top_y);
|
||||
boxwidth -= 4;
|
||||
CumulativeSumToAverage(cumsum_top_row + (x - radius - 1) * 4,
|
||||
cumsum_bot_row + (x - radius - 1) * 4,
|
||||
boxwidth, area, &dst_argb[x * 4], 1);
|
||||
}
|
||||
dst_argb += dst_stride_argb;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
15
source/row.h
15
source/row.h
@ -76,6 +76,8 @@ extern "C" {
|
||||
#define HAS_YUY2TOYROW_SSE2
|
||||
#define HAS_ARGBGRAYROW_SSSE3
|
||||
#define HAS_ARGBSEPIAROW_SSSE3
|
||||
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||
#define HAS_CUMULATIVESUMTOAVERAGE_SSE2
|
||||
#endif
|
||||
|
||||
// The following are disabled when SSSE3 is available:
|
||||
@ -105,6 +107,7 @@ typedef __declspec(align(16)) int8 vec8[16];
|
||||
typedef __declspec(align(16)) uint8 uvec8[16];
|
||||
typedef __declspec(align(16)) int16 vec16[8];
|
||||
typedef __declspec(align(16)) uint16 uvec16[8];
|
||||
typedef __declspec(align(16)) int32 vec32[4];
|
||||
typedef __declspec(align(16)) uint32 uvec32[4];
|
||||
#else // __GNUC__
|
||||
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
|
||||
@ -112,6 +115,7 @@ typedef int8 __attribute__((vector_size(16))) vec8;
|
||||
typedef uint8 __attribute__((vector_size(16))) uvec8;
|
||||
typedef int16 __attribute__((vector_size(16))) vec16;
|
||||
typedef uint16 __attribute__((vector_size(16))) uvec16;
|
||||
typedef int32 __attribute__((vector_size(16))) vec32;
|
||||
typedef uint32 __attribute__((vector_size(16))) uvec32;
|
||||
#endif
|
||||
|
||||
@ -485,6 +489,17 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width);
|
||||
void ARGBSepiaRow_C(uint8* dst_argb, int width);
|
||||
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
|
||||
|
||||
// Used for blur.
|
||||
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
|
||||
int width, int area, uint8* dst, int count);
|
||||
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
||||
int32* previous_cumsum, int width);
|
||||
|
||||
void CumulativeSumToAverage_C(const int32* topleft, const int32* botleft,
|
||||
int width, int area, uint8* dst, int count);
|
||||
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
|
||||
int32* previous_cumsum, int width);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -950,6 +950,35 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
|
||||
int32* previous_cumsum, int width) {
|
||||
int32 row_sum[4] = {0, 0, 0, 0};
|
||||
for (int x = 0; x < width; ++x) {
|
||||
row_sum[0] += row[x * 4 + 0];
|
||||
row_sum[1] += row[x * 4 + 1];
|
||||
row_sum[2] += row[x * 4 + 2];
|
||||
row_sum[3] += row[x * 4 + 3];
|
||||
cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
|
||||
cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
|
||||
cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
|
||||
cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
|
||||
}
|
||||
}
|
||||
|
||||
void CumulativeSumToAverage_C(const int32* tl, const int32* bl,
|
||||
int w, int area, uint8* dst, int count) {
|
||||
float ooa = 1.0f / area;
|
||||
for (int i = 0; i < count; ++i) {
|
||||
dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
|
||||
dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
|
||||
dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
|
||||
dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
|
||||
dst += 4;
|
||||
tl += 4;
|
||||
bl += 4;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -2932,6 +2932,177 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
||||
}
|
||||
#endif // HAS_ARGBSEPIAROW_SSSE3
|
||||
|
||||
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||
// Creates a table of cumulative sums where each value is a sum of all values
|
||||
// above and to the left of the value, inclusive of the value.
|
||||
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
||||
int32* previous_cumsum, int width) {
|
||||
asm volatile (
|
||||
"sub %1,%2 \n"
|
||||
"pxor %%xmm0,%%xmm0 \n"
|
||||
"pxor %%xmm1,%%xmm1 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jl 49f \n"
|
||||
"test $0xf,%1 \n"
|
||||
"jne 49f \n"
|
||||
|
||||
// 4 pixel loop \n"
|
||||
".p2align 2 \n"
|
||||
"40: \n"
|
||||
"movdqu (%0),%%xmm2 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm4 \n"
|
||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"punpcklwd %%xmm1,%%xmm2 \n"
|
||||
"punpckhwd %%xmm1,%%xmm3 \n"
|
||||
"punpckhbw %%xmm1,%%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"punpcklwd %%xmm1,%%xmm4 \n"
|
||||
"punpckhwd %%xmm1,%%xmm5 \n"
|
||||
"paddd %%xmm2,%%xmm0 \n"
|
||||
"movdqa (%1,%2,1),%%xmm2 \n"
|
||||
"paddd %%xmm0,%%xmm2 \n"
|
||||
"paddd %%xmm3,%%xmm0 \n"
|
||||
"movdqa 0x10(%1,%2,1),%%xmm3 \n"
|
||||
"paddd %%xmm0,%%xmm3 \n"
|
||||
"paddd %%xmm4,%%xmm0 \n"
|
||||
"movdqa 0x20(%1,%2,1),%%xmm4 \n"
|
||||
"paddd %%xmm0,%%xmm4 \n"
|
||||
"paddd %%xmm5,%%xmm0 \n"
|
||||
"movdqa 0x30(%1,%2,1),%%xmm5 \n"
|
||||
"paddd %%xmm0,%%xmm5 \n"
|
||||
"movdqa %%xmm2,(%1) \n"
|
||||
"movdqa %%xmm3,0x10(%1) \n"
|
||||
"movdqa %%xmm4,0x20(%1) \n"
|
||||
"movdqa %%xmm5,0x30(%1) \n"
|
||||
"lea 0x40(%1),%1 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jge 40b \n"
|
||||
|
||||
"49: \n"
|
||||
"add $0x3,%3 \n"
|
||||
"jl 19f \n"
|
||||
|
||||
// 1 pixel loop \n"
|
||||
".p2align 2 \n"
|
||||
"10: \n"
|
||||
"movd (%0),%%xmm2 \n"
|
||||
"lea 0x4(%0),%0 \n"
|
||||
"punpcklbw %%xmm4,%%xmm2 \n"
|
||||
"punpcklwd %%xmm4,%%xmm2 \n"
|
||||
"paddd %%xmm2,%%xmm0 \n"
|
||||
"movdqu (%1,%2,1),%%xmm2 \n"
|
||||
"paddd %%xmm0,%%xmm2 \n"
|
||||
"movdqu %%xmm2,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 10b \n"
|
||||
|
||||
"19: \n"
|
||||
: "+r"(row), // %0
|
||||
"+r"(cumsum), // %1
|
||||
"+r"(previous_cumsum), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
: "memory", "cc"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||
|
||||
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
|
||||
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
|
||||
int width, int area, uint8* dst, int count) {
|
||||
asm volatile (
|
||||
"movd %5,%%xmm4 \n"
|
||||
"cvtdq2ps %%xmm4,%%xmm4 \n"
|
||||
"rcpss %%xmm4,%%xmm4 \n"
|
||||
"pshufd $0x0,%%xmm4,%%xmm4 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jl 49f \n"
|
||||
|
||||
// 4 pixel loop \n"
|
||||
".p2align 2 \n"
|
||||
"40: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa 0x20(%0),%%xmm2 \n"
|
||||
"movdqa 0x30(%0),%%xmm3 \n"
|
||||
"psubd (%0,%4,4),%%xmm0 \n"
|
||||
"psubd 0x10(%0,%4,4),%%xmm1 \n"
|
||||
"psubd 0x20(%0,%4,4),%%xmm2 \n"
|
||||
"psubd 0x30(%0,%4,4),%%xmm3 \n"
|
||||
"lea 0x40(%0),%0 \n"
|
||||
"psubd (%1),%%xmm0 \n"
|
||||
"psubd 0x10(%1),%%xmm1 \n"
|
||||
"psubd 0x20(%1),%%xmm2 \n"
|
||||
"psubd 0x30(%1),%%xmm3 \n"
|
||||
"paddd (%1,%4,4),%%xmm0 \n"
|
||||
"paddd 0x10(%1,%4,4),%%xmm1 \n"
|
||||
"paddd 0x20(%1,%4,4),%%xmm2 \n"
|
||||
"paddd 0x30(%1,%4,4),%%xmm3 \n"
|
||||
"lea 0x40(%1),%1 \n"
|
||||
"cvtdq2ps %%xmm0,%%xmm0 \n"
|
||||
"cvtdq2ps %%xmm1,%%xmm1 \n"
|
||||
"mulps %%xmm4,%%xmm0 \n"
|
||||
"mulps %%xmm4,%%xmm1 \n"
|
||||
"cvtdq2ps %%xmm2,%%xmm2 \n"
|
||||
"cvtdq2ps %%xmm3,%%xmm3 \n"
|
||||
"mulps %%xmm4,%%xmm2 \n"
|
||||
"mulps %%xmm4,%%xmm3 \n"
|
||||
"cvtps2dq %%xmm0,%%xmm0 \n"
|
||||
"cvtps2dq %%xmm1,%%xmm1 \n"
|
||||
"cvtps2dq %%xmm2,%%xmm2 \n"
|
||||
"cvtps2dq %%xmm3,%%xmm3 \n"
|
||||
"packssdw %%xmm1,%%xmm0 \n"
|
||||
"packssdw %%xmm3,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"movdqu %%xmm0,(%2) \n"
|
||||
"lea 0x10(%2),%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jge 40b \n"
|
||||
|
||||
"49: \n"
|
||||
"add $0x3,%3 \n"
|
||||
"jl 19f \n"
|
||||
|
||||
// 1 pixel loop \n"
|
||||
".p2align 2 \n"
|
||||
"10: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"psubd (%0,%4,4),%%xmm0 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"psubd (%1),%%xmm0 \n"
|
||||
"paddd (%1,%4,4),%%xmm0 \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"cvtdq2ps %%xmm0,%%xmm0 \n"
|
||||
"mulps %%xmm4,%%xmm0 \n"
|
||||
"cvtps2dq %%xmm0,%%xmm0 \n"
|
||||
"packssdw %%xmm0,%%xmm0 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"movd %%xmm0,(%2) \n"
|
||||
"lea 0x4(%2),%2 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 10b \n"
|
||||
"19: \n"
|
||||
: "+r"(topleft), // %0
|
||||
"+r"(botleft), // %1
|
||||
"+r"(dst), // %2
|
||||
"+rm"(count) // %3
|
||||
: "r"(static_cast<intptr_t>(width)), // %4
|
||||
"rm"(area) // %5
|
||||
: "memory", "cc"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
|
||||
|
||||
|
||||
#endif // defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -3011,6 +3011,197 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBSEPIAROW_SSSE3
|
||||
|
||||
#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
|
||||
// Consider float CumulativeSum.
|
||||
// Consider calling CumulativeSum one row at time as needed.
|
||||
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
|
||||
// Convert cumulative sum for an area to an average for 1 pixel.
|
||||
// topleft is pointer to top left of CumulativeSum buffer for area.
|
||||
// botleft is pointer to bottom left of CumulativeSum buffer.
|
||||
// width is offset from left to right of area in CumulativeSum buffer measured
|
||||
// in number of ints.
|
||||
// area is the number of pixels in the area being averaged.
|
||||
// dst points to pixel to store result to.
|
||||
// count is number of averaged pixels to produce.
|
||||
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
|
||||
// aligned.
|
||||
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
|
||||
int width, int area, uint8* dst, int count) {
|
||||
__asm {
|
||||
mov eax, topleft // eax topleft
|
||||
mov esi, botleft // esi botleft
|
||||
mov edx, width
|
||||
movd xmm4, area
|
||||
mov edi, dst
|
||||
mov ecx, count
|
||||
cvtdq2ps xmm4, xmm4
|
||||
rcpss xmm4, xmm4 // 1.0f / area
|
||||
pshufd xmm4, xmm4, 0
|
||||
sub ecx, 4
|
||||
jl l4b
|
||||
|
||||
// 4 pixel loop
|
||||
align 4
|
||||
l4:
|
||||
// top left
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
|
||||
// - top right
|
||||
psubd xmm0, [eax + edx * 4]
|
||||
psubd xmm1, [eax + edx * 4 + 16]
|
||||
psubd xmm2, [eax + edx * 4 + 32]
|
||||
psubd xmm3, [eax + edx * 4 + 48]
|
||||
lea eax, [eax + 64]
|
||||
|
||||
// - bottom left
|
||||
psubd xmm0, [esi]
|
||||
psubd xmm1, [esi + 16]
|
||||
psubd xmm2, [esi + 32]
|
||||
psubd xmm3, [esi + 48]
|
||||
|
||||
// + bottom right
|
||||
paddd xmm0, [esi + edx * 4]
|
||||
paddd xmm1, [esi + edx * 4 + 16]
|
||||
paddd xmm2, [esi + edx * 4 + 32]
|
||||
paddd xmm3, [esi + edx * 4 + 48]
|
||||
lea esi, [esi + 64]
|
||||
|
||||
cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
|
||||
cvtdq2ps xmm1, xmm1
|
||||
mulps xmm0, xmm4
|
||||
mulps xmm1, xmm4
|
||||
cvtdq2ps xmm2, xmm2
|
||||
cvtdq2ps xmm3, xmm3
|
||||
mulps xmm2, xmm4
|
||||
mulps xmm3, xmm4
|
||||
cvtps2dq xmm0, xmm0
|
||||
cvtps2dq xmm1, xmm1
|
||||
cvtps2dq xmm2, xmm2
|
||||
cvtps2dq xmm3, xmm3
|
||||
packssdw xmm0, xmm1
|
||||
packssdw xmm2, xmm3
|
||||
packuswb xmm0, xmm2
|
||||
movdqu [edi], xmm0
|
||||
lea edi, [edi + 16]
|
||||
sub ecx, 4
|
||||
jge l4
|
||||
|
||||
l4b:
|
||||
add ecx, 4 - 1
|
||||
jl l1b
|
||||
|
||||
// 1 pixel loop
|
||||
align 4
|
||||
l1:
|
||||
movdqa xmm0, [eax]
|
||||
psubd xmm0, [eax + edx * 4]
|
||||
lea eax, [eax + 16]
|
||||
psubd xmm0, [esi]
|
||||
paddd xmm0, [esi + edx * 4]
|
||||
lea esi, [esi + 16]
|
||||
cvtdq2ps xmm0, xmm0
|
||||
mulps xmm0, xmm4
|
||||
cvtps2dq xmm0, xmm0
|
||||
packssdw xmm0, xmm0
|
||||
packuswb xmm0, xmm0
|
||||
movd dword ptr [edi], xmm0
|
||||
lea edi, [edi + 4]
|
||||
sub ecx, 1
|
||||
jge l1
|
||||
l1b:
|
||||
}
|
||||
}
|
||||
#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
|
||||
|
||||
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||
// Creates a table of cumulative sums where each value is a sum of all values
|
||||
// above and to the left of the value.
|
||||
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
|
||||
int32* previous_cumsum, int width) {
|
||||
__asm {
|
||||
mov eax, row
|
||||
mov edx, cumsum
|
||||
mov esi, previous_cumsum
|
||||
mov ecx, width
|
||||
sub esi, edx
|
||||
pxor xmm0, xmm0
|
||||
pxor xmm1, xmm1
|
||||
|
||||
sub ecx, 4
|
||||
jl l4b
|
||||
test edx, 15
|
||||
jne l4b
|
||||
|
||||
// 4 pixel loop
|
||||
align 4
|
||||
l4:
|
||||
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
|
||||
lea eax, [eax + 16]
|
||||
movdqa xmm4, xmm2
|
||||
|
||||
punpcklbw xmm2, xmm1
|
||||
movdqa xmm3, xmm2
|
||||
punpcklwd xmm2, xmm1
|
||||
punpckhwd xmm3, xmm1
|
||||
|
||||
punpckhbw xmm4, xmm1
|
||||
movdqa xmm5, xmm4
|
||||
punpcklwd xmm4, xmm1
|
||||
punpckhwd xmm5, xmm1
|
||||
|
||||
paddd xmm0, xmm2
|
||||
movdqa xmm2, [edx + esi] // previous row above.
|
||||
paddd xmm2, xmm0
|
||||
|
||||
paddd xmm0, xmm3
|
||||
movdqa xmm3, [edx + esi + 16]
|
||||
paddd xmm3, xmm0
|
||||
|
||||
paddd xmm0, xmm4
|
||||
movdqa xmm4, [edx + esi + 32]
|
||||
paddd xmm4, xmm0
|
||||
|
||||
paddd xmm0, xmm5
|
||||
movdqa xmm5, [edx + esi + 48]
|
||||
paddd xmm5, xmm0
|
||||
|
||||
movdqa [edx], xmm2
|
||||
movdqa [edx + 16], xmm3
|
||||
movdqa [edx + 32], xmm4
|
||||
movdqa [edx + 48], xmm5
|
||||
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 4
|
||||
jge l4
|
||||
|
||||
l4b:
|
||||
add ecx, 4 - 1
|
||||
jl l1b
|
||||
|
||||
// 1 pixel loop
|
||||
align 4
|
||||
l1:
|
||||
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
|
||||
lea eax, [eax + 4]
|
||||
punpcklbw xmm2, xmm4
|
||||
punpcklwd xmm2, xmm4
|
||||
paddd xmm0, xmm2
|
||||
movdqu xmm2, [edx + esi]
|
||||
paddd xmm2, xmm0
|
||||
movdqu [edx], xmm2
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 1
|
||||
jge l1
|
||||
|
||||
l1b:
|
||||
}
|
||||
}
|
||||
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
|
||||
|
||||
#endif // _M_IX86
|
||||
|
||||
|
||||
|
||||
@ -353,30 +353,30 @@ TEST_F(libyuvTest, TestAttenuate) {
|
||||
EXPECT_EQ(255, atten_pixels[255][3]);
|
||||
}
|
||||
|
||||
TEST_F(libyuvTest, TestAddRow) {
|
||||
SIMD_ALIGNED(uint8 orig_pixels[256]);
|
||||
SIMD_ALIGNED(uint16 added_pixels[256]);
|
||||
TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
|
||||
SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
|
||||
SIMD_ALIGNED(int32 added_pixels[16][16][4]);
|
||||
|
||||
libyuv::AddRow AddRow = GetAddRow(added_pixels, 256);
|
||||
libyuv::AddRow SubRow = GetSubRow(added_pixels, 256);
|
||||
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
orig_pixels[i] = i;
|
||||
for (int y = 0; y < 16; ++y) {
|
||||
for (int x = 0; x < 16; ++x) {
|
||||
orig_pixels[y][x][0] = 1u;
|
||||
orig_pixels[y][x][1] = 2u;
|
||||
orig_pixels[y][x][2] = 3u;
|
||||
orig_pixels[y][x][3] = 255u;
|
||||
}
|
||||
}
|
||||
memset(added_pixels, 0, sizeof(uint16) * 256);
|
||||
|
||||
AddRow(orig_pixels, added_pixels, 256);
|
||||
EXPECT_EQ(7u, added_pixels[7]);
|
||||
EXPECT_EQ(250u, added_pixels[250]);
|
||||
AddRow(orig_pixels, added_pixels, 256);
|
||||
EXPECT_EQ(14u, added_pixels[7]);
|
||||
EXPECT_EQ(500u, added_pixels[250]);
|
||||
SubRow(orig_pixels, added_pixels, 256);
|
||||
EXPECT_EQ(7u, added_pixels[7]);
|
||||
EXPECT_EQ(250u, added_pixels[250]);
|
||||
ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
|
||||
&added_pixels[0][0][0], 16 * 4,
|
||||
16, 16);
|
||||
|
||||
for (int i = 0; i < 1000 * (1280 * 720 * 4 / 256); ++i) {
|
||||
AddRow(orig_pixels, added_pixels, 256);
|
||||
for (int y = 0; y < 16; ++y) {
|
||||
for (int x = 0; x < 16; ++x) {
|
||||
EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
|
||||
EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
|
||||
EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
|
||||
EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user