mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-12 06:29:57 +08:00
Box filter for YUV uses rows with an accumulation buffer for better memory behavior. The old code would do columns accumulated into registers, and then store the result once. This was slow from a memory point of view. The new code does a row of source at a time, updating an accumulation buffer every row. The accumulation buffer is small and should fit in cache. Before each accumulation of N rows, the buffer needs to be reset to zero. If the memset is a bottleneck, it would be faster to do the first row without an add, storing to the accumulation buffer, and then add for the remaining rows.
BUG=425 TESTED=out\release\libyuv_unittest --gtest_filter=*ScaleTo1x1* R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/52659004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1428 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
b07de879b6
commit
05416e2d9a
@ -30,13 +30,11 @@ extern "C" {
|
||||
#define VISUALC_HAS_AVX2 1
|
||||
#endif // VisualStudio >= 2012
|
||||
|
||||
|
||||
// The following are available on all x86 platforms:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
|
||||
#define HAS_FIXEDDIV1_X86
|
||||
#define HAS_FIXEDDIV_X86
|
||||
#define HAS_SCALEADDROWS_SSE2
|
||||
#define HAS_SCALEARGBCOLS_SSE2
|
||||
#define HAS_SCALEARGBCOLSUP2_SSE2
|
||||
#define HAS_SCALEARGBFILTERCOLS_SSSE3
|
||||
@ -50,17 +48,21 @@ extern "C" {
|
||||
#define HAS_SCALEROWDOWN4_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on VS2012.
|
||||
// The following are available on VS2012:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
|
||||
#define HAS_SCALEADDROWS_AVX2
|
||||
#define HAS_SCALEADDROW_AVX2
|
||||
#define HAS_SCALEROWDOWN2_AVX2
|
||||
#define HAS_SCALEROWDOWN4_AVX2
|
||||
#endif
|
||||
|
||||
// The following are available on Visual C:
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__)
|
||||
#define HAS_SCALEADDROW_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_SCALEADDROWS_NEON
|
||||
#define HAS_SCALEARGBCOLS_NEON
|
||||
#define HAS_SCALEARGBROWDOWN2_NEON
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
@ -183,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int dst_width);
|
||||
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint32* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
|
||||
void ScaleARGBRowDown2_C(const uint8* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_argb, int dst_width);
|
||||
@ -289,14 +289,10 @@ void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
|
||||
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRows_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRows_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
|
||||
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx);
|
||||
@ -442,10 +438,8 @@ void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst_ptr, int dst_width);
|
||||
|
||||
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRows_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height);
|
||||
void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
|
||||
|
||||
void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
@ -733,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint8* src_ptr, uint8* dst_ptr) {
|
||||
int j;
|
||||
int j, k;
|
||||
// Initial source x/y coordinate and step values as 16.16 fixed point.
|
||||
int x = 0;
|
||||
int y = 0;
|
||||
@ -750,29 +750,29 @@ static void ScalePlaneBox(int src_width, int src_height,
|
||||
const uint16* src_ptr, uint8* dst_ptr) =
|
||||
(dx & 0xffff) ? ScaleAddCols2_C:
|
||||
((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
|
||||
void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
|
||||
#if defined(HAS_SCALEADDROWS_SSE2)
|
||||
void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
|
||||
ScaleAddRow_C;
|
||||
#if defined(HAS_SCALEADDROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleAddRows = ScaleAddRows_Any_SSE2;
|
||||
ScaleAddRow = ScaleAddRow_Any_SSE2;
|
||||
if (IS_ALIGNED(src_width, 16)) {
|
||||
ScaleAddRows = ScaleAddRows_SSE2;
|
||||
ScaleAddRow = ScaleAddRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEADDROWS_AVX2)
|
||||
#if defined(HAS_SCALEADDROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ScaleAddRows = ScaleAddRows_Any_AVX2;
|
||||
ScaleAddRow = ScaleAddRow_Any_AVX2;
|
||||
if (IS_ALIGNED(src_width, 32)) {
|
||||
ScaleAddRows = ScaleAddRows_AVX2;
|
||||
ScaleAddRow = ScaleAddRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEADDROWS_NEON)
|
||||
#if defined(HAS_SCALEADDROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleAddRows = ScaleAddRows_Any_NEON;
|
||||
ScaleAddRow = ScaleAddRow_Any_NEON;
|
||||
if (IS_ALIGNED(src_width, 16)) {
|
||||
ScaleAddRows = ScaleAddRows_NEON;
|
||||
ScaleAddRow = ScaleAddRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -786,7 +786,11 @@ static void ScalePlaneBox(int src_width, int src_height,
|
||||
y = max_y;
|
||||
}
|
||||
boxheight = MIN1((y >> 16) - iy);
|
||||
ScaleAddRows(src, src_stride, (uint16*)(row16), src_width, boxheight);
|
||||
memset(row16, 0, src_width * 2);
|
||||
for (k = 0; k < boxheight; ++k) {
|
||||
ScaleAddRow(src, (uint16 *)(row16), src_width);
|
||||
src += src_stride;
|
||||
}
|
||||
ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
@ -798,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height,
|
||||
int dst_width, int dst_height,
|
||||
int src_stride, int dst_stride,
|
||||
const uint16* src_ptr, uint16* dst_ptr) {
|
||||
int j;
|
||||
int j, k;
|
||||
// Initial source x/y coordinate and step values as 16.16 fixed point.
|
||||
int x = 0;
|
||||
int y = 0;
|
||||
@ -814,12 +818,12 @@ static void ScalePlaneBox_16(int src_width, int src_height,
|
||||
void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
|
||||
const uint32* src_ptr, uint16* dst_ptr) =
|
||||
(dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
|
||||
void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
|
||||
void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
|
||||
ScaleAddRow_16_C;
|
||||
|
||||
#if defined(HAS_SCALEADDROWS_16_SSE2)
|
||||
#if defined(HAS_SCALEADDROW_16_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
|
||||
ScaleAddRows = ScaleAddRows_16_SSE2;
|
||||
ScaleAddRow = ScaleAddRow_16_SSE2;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -832,7 +836,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
|
||||
y = max_y;
|
||||
}
|
||||
boxheight = MIN1((y >> 16) - iy);
|
||||
ScaleAddRows(src, src_stride, (uint32*)(row32), src_width, boxheight);
|
||||
memset(row32, 0, src_width * 4);
|
||||
for (k = 0; k < boxheight; ++k) {
|
||||
ScaleAddRow(src, (uint32 *)(row32), src_width);
|
||||
src += src_stride;
|
||||
}
|
||||
ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
|
||||
dst_ptr += dst_stride;
|
||||
}
|
||||
|
||||
@ -169,25 +169,23 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
|
||||
#endif
|
||||
|
||||
// Add rows box filter scale down.
|
||||
#define SAANY(NAMEANY, SCALEADDROWS_SIMD, SCALEADDROWS_C, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \
|
||||
uint16* dst_ptr, int src_width, int src_height) { \
|
||||
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \
|
||||
int n = src_width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
SCALEADDROWS_SIMD(src_ptr, src_stride, dst_ptr, n, src_height); \
|
||||
SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \
|
||||
} \
|
||||
SCALEADDROWS_C(src_ptr + n, src_stride, \
|
||||
dst_ptr + n, src_width & MASK, src_height); \
|
||||
SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEADDROWS_SSE2
|
||||
SAANY(ScaleAddRows_Any_SSE2, ScaleAddRows_SSE2, ScaleAddRows_C, 15)
|
||||
#ifdef HAS_SCALEADDROW_SSE2
|
||||
SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROWS_AVX2
|
||||
SAANY(ScaleAddRows_Any_AVX2, ScaleAddRows_AVX2, ScaleAddRows_C, 31)
|
||||
#ifdef HAS_SCALEADDROW_AVX2
|
||||
SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
|
||||
#endif
|
||||
#ifdef HAS_SCALEADDROWS_NEON
|
||||
SAANY(ScaleAddRows_Any_NEON, ScaleAddRows_NEON, ScaleAddRows_C, 15)
|
||||
#ifdef HAS_SCALEADDROW_NEON
|
||||
SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
|
||||
#endif
|
||||
#undef SAANY
|
||||
|
||||
|
||||
@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) {
|
||||
void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
||||
int x;
|
||||
assert(src_width > 0);
|
||||
assert(src_height > 0);
|
||||
for (x = 0; x < src_width; ++x) {
|
||||
const uint8* s = src_ptr + x;
|
||||
unsigned int sum = 0u;
|
||||
int y;
|
||||
for (y = 0; y < src_height; ++y) {
|
||||
sum += s[0];
|
||||
s += src_stride;
|
||||
}
|
||||
// TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
|
||||
dst_ptr[x] = sum < 65535u ? sum : 65535u;
|
||||
for (x = 0; x < src_width - 1; x += 2) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
dst_ptr[1] += src_ptr[1];
|
||||
src_ptr += 2;
|
||||
dst_ptr += 2;
|
||||
}
|
||||
if (src_width & 1) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
|
||||
uint32* dst_ptr, int src_width, int src_height) {
|
||||
void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
|
||||
int x;
|
||||
assert(src_width > 0);
|
||||
assert(src_height > 0);
|
||||
for (x = 0; x < src_width; ++x) {
|
||||
const uint16* s = src_ptr + x;
|
||||
unsigned int sum = 0u;
|
||||
int y;
|
||||
for (y = 0; y < src_height; ++y) {
|
||||
sum += s[0];
|
||||
s += src_stride;
|
||||
}
|
||||
// No risk of overflow here now
|
||||
dst_ptr[x] = sum;
|
||||
for (x = 0; x < src_width - 1; x += 2) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
dst_ptr[1] += src_ptr[1];
|
||||
src_ptr += 2;
|
||||
dst_ptr += 2;
|
||||
}
|
||||
if (src_width & 1) {
|
||||
dst_ptr[0] += src_ptr[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -800,104 +800,61 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
||||
}
|
||||
}
|
||||
|
||||
// Reads 16xN bytes and produces 16 shorts at a time.
|
||||
// Reads 16 bytes and accumulates to 16 shorts at a time.
|
||||
__declspec(naked)
|
||||
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) {
|
||||
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
push ebp
|
||||
mov esi, [esp + 16 + 4] // src_ptr
|
||||
mov edx, [esp + 16 + 8] // src_stride
|
||||
mov edi, [esp + 16 + 12] // dst_ptr
|
||||
mov ecx, [esp + 16 + 16] // dst_width
|
||||
mov ebx, [esp + 16 + 20] // height
|
||||
mov eax, esi // row pointer
|
||||
mov ebp, ebx // height
|
||||
pxor xmm0, xmm0 // clear accumulators
|
||||
pxor xmm1, xmm1
|
||||
pxor xmm4, xmm4
|
||||
mov eax, [esp + 4] // src_ptr
|
||||
mov edx, [esp + 8] // dst_ptr
|
||||
mov ecx, [esp + 12] // src_width
|
||||
pxor xmm5, xmm5
|
||||
|
||||
// sum rows
|
||||
xloop:
|
||||
movdqu xmm2, [eax] // read 16 pixels
|
||||
lea eax, [eax + edx] // advance to next row
|
||||
movdqa xmm3, xmm2
|
||||
punpcklbw xmm2, xmm4
|
||||
punpckhbw xmm3, xmm4
|
||||
movdqu xmm3, [eax] // read 16 bytes
|
||||
lea eax, [eax + 16]
|
||||
movdqu xmm0, [edx] // read 16 words from destination
|
||||
movdqu xmm1, [edx + 16]
|
||||
movdqa xmm2, xmm3
|
||||
punpcklbw xmm2, xmm5
|
||||
punpckhbw xmm3, xmm5
|
||||
paddusw xmm0, xmm2 // sum 16 words
|
||||
paddusw xmm1, xmm3
|
||||
sub ebp, 1
|
||||
jg xloop
|
||||
|
||||
movdqu [edi], xmm0
|
||||
movdqu [edi + 16], xmm1
|
||||
lea edi, [edi + 32] // dst_ptr += 16
|
||||
lea esi, [esi + 16] // src_ptr += 16
|
||||
mov eax, esi // row pointer
|
||||
mov ebp, ebx // height
|
||||
pxor xmm0, xmm0 // clear accumulators
|
||||
pxor xmm1, xmm1
|
||||
movdqu [edx], xmm0 // write 16 words to destination
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 16
|
||||
jg xloop
|
||||
|
||||
pop ebp
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Reads 32xN bytes and produces 32 shorts at a time.
|
||||
// Reads 32 bytes and accumulates to 32 shorts at a time.
|
||||
__declspec(naked)
|
||||
void ScaleAddRows_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint16* dst_ptr, int src_width, int src_height) {
|
||||
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
push ebx
|
||||
push ebp
|
||||
mov esi, [esp + 16 + 4] // src_ptr
|
||||
mov edx, [esp + 16 + 8] // src_stride
|
||||
mov edi, [esp + 16 + 12] // dst_ptr
|
||||
mov ecx, [esp + 16 + 16] // dst_width
|
||||
mov ebx, [esp + 16 + 20] // height
|
||||
mov eax, esi // row pointer
|
||||
mov ebp, ebx // height
|
||||
vpxor ymm0, ymm0, ymm0 // clear accumulators
|
||||
vpxor ymm1, ymm1, ymm1
|
||||
vpxor ymm4, ymm4, ymm4
|
||||
mov eax, [esp + 4] // src_ptr
|
||||
mov edx, [esp + 8] // dst_ptr
|
||||
mov ecx, [esp + 12] // src_width
|
||||
vpxor ymm5, ymm5, ymm5
|
||||
|
||||
// sum rows
|
||||
xloop:
|
||||
vmovdqu ymm2, [eax] // read 16 pixels
|
||||
vpermq ymm2, ymm2, 0xd8 // unmutate for vpunpck
|
||||
lea eax, [eax + edx] // advance to next row
|
||||
vpunpckhbw ymm3, ymm2, ymm4
|
||||
vpunpcklbw ymm2, ymm2, ymm4
|
||||
vmovdqu ymm3, [eax] // read 32 bytes
|
||||
vpermq ymm3, ymm2, 0xd8 // unmutate for vpunpck
|
||||
lea eax, [eax + 32]
|
||||
vmovdqu ymm0, [edx] // read 32 words from destination
|
||||
vmovdqu ymm1, [edx + 32]
|
||||
vpunpcklbw ymm2, ymm3, ymm5
|
||||
vpunpckhbw ymm3, ymm3, ymm5
|
||||
vpaddusw ymm0, ymm0, ymm2 // sum 16 words
|
||||
vpaddusw ymm1, ymm1, ymm3
|
||||
sub ebp, 1
|
||||
jg xloop
|
||||
|
||||
vmovdqu [edi], ymm0
|
||||
vmovdqu [edi + 32], ymm1
|
||||
lea edi, [edi + 64] // dst_ptr
|
||||
lea esi, [esi + 32] // src_ptr
|
||||
mov eax, esi // row pointer
|
||||
mov ebp, ebx // height
|
||||
vpxor ymm0, ymm0, ymm0 // clear accumulators
|
||||
vpxor ymm1, ymm1, ymm1
|
||||
vmovdqu [edx], ymm0 // write 32 words to destination
|
||||
vmovdqu [edx + 32], ymm1
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 32
|
||||
jg xloop
|
||||
|
||||
pop ebp
|
||||
pop ebx
|
||||
pop edi
|
||||
pop esi
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
|
||||
@ -78,7 +78,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
|
||||
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
|
||||
memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \
|
||||
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
|
||||
src_u + OFF, \
|
||||
SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
|
||||
@ -211,7 +211,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
|
||||
memset(dst_y_opt, 101, kWidth * kHeight); \
|
||||
memset(dst_uv_opt, 102, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
|
||||
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
|
||||
src_u + OFF, \
|
||||
SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
|
||||
@ -326,7 +326,7 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
|
||||
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
|
||||
memset(dst_v_opt, 103, SUBSAMPLE(kWidth, SUBSAMP_X) * \
|
||||
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
|
||||
src_uv + OFF, \
|
||||
2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
|
||||
@ -435,7 +435,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
|
||||
} \
|
||||
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
|
||||
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
|
||||
src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
|
||||
src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
|
||||
@ -538,7 +538,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
|
||||
} \
|
||||
memset(dst_argb_c, 1, kStrideB * kHeight); \
|
||||
memset(dst_argb_opt, 101, kStrideB * kHeight); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
|
||||
src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
|
||||
dst_argb_c, kWidth * BPP_B, \
|
||||
@ -632,7 +632,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
|
||||
for (int i = 0; i < kHeight; ++i) \
|
||||
for (int j = 0; j < kStride; ++j) \
|
||||
src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
|
||||
dst_y_c, kWidth, \
|
||||
dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
|
||||
@ -690,6 +690,8 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
|
||||
|
||||
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
|
||||
#if defined(__arm__) || defined (__aarch64__)
|
||||
// arm version subsamples by summing 4 pixels then multiplying by matrix with
|
||||
// 4x smaller coefficients which are rounded to nearest integer.
|
||||
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
|
||||
#else
|
||||
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
|
||||
@ -738,7 +740,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
|
||||
memset(dst_y_opt, 101, kWidth * kHeight); \
|
||||
memset(dst_uv_opt, 102, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
|
||||
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
|
||||
dst_y_c, kWidth, \
|
||||
dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
|
||||
@ -814,7 +816,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \
|
||||
} \
|
||||
memset(dst_argb_c, 1, kStrideB * kHeightB); \
|
||||
memset(dst_argb_opt, 101, kStrideB * kHeightB); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
|
||||
dst_argb_c, kStrideB, \
|
||||
kWidth, NEG kHeight); \
|
||||
@ -858,7 +860,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
|
||||
} \
|
||||
memset(dst_argb_c, 123, kStrideB * kHeightB); \
|
||||
memset(dst_argb_opt, 123, kStrideB * kHeightB); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_A##To##FMT_B(src_argb, kStrideA, \
|
||||
dst_argb_c, kStrideB, \
|
||||
kWidth, kHeight); \
|
||||
@ -948,7 +950,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither##N) { \
|
||||
} \
|
||||
memset(dst_argb_c, 1, kStrideB * kHeightB); \
|
||||
memset(dst_argb_opt, 101, kStrideB * kHeightB); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \
|
||||
dst_argb_c, kStrideB, \
|
||||
NULL, kWidth, NEG kHeight); \
|
||||
@ -992,7 +994,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither_Random) { \
|
||||
} \
|
||||
memset(dst_argb_c, 123, kStrideB * kHeightB); \
|
||||
memset(dst_argb_opt, 123, kStrideB * kHeightB); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \
|
||||
dst_argb_c, kStrideB, \
|
||||
NULL, kWidth, kHeight); \
|
||||
@ -1051,7 +1053,7 @@ TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \
|
||||
} \
|
||||
memset(dst_argb_c, 1, kStrideA * kHeightA); \
|
||||
memset(dst_argb_opt, 101, kStrideA * kHeightA); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_ATOB(src_argb + OFF, kStrideA, \
|
||||
dst_argb_c, kStrideA, \
|
||||
kWidth, NEG kHeight); \
|
||||
@ -1061,7 +1063,7 @@ TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \
|
||||
dst_argb_opt, kStrideA, \
|
||||
kWidth, NEG kHeight); \
|
||||
} \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_ATOB(dst_argb_c, kStrideA, \
|
||||
dst_argb_c, kStrideA, \
|
||||
kWidth, NEG kHeight); \
|
||||
@ -1470,7 +1472,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
|
||||
} \
|
||||
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
|
||||
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, \
|
||||
src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
|
||||
src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user