mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-07 01:06:46 +08:00)
commit 044f914c29 (parent 9c4c82181b)

Change scale to unaligned movdqu.

BUG=365
TESTED=scale unittests
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/22879004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1101 16f28f9a-4ce2-e073-06de-1de4eb20be90
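For context: before this change each SIMD row function came in three flavors, and callers probed pointer and stride alignment to pick between them; after it, the movdqu kernel is the only one left, so the probe disappears. On recent x86 cores an unaligned movdqu access to 16-byte-aligned memory generally costs the same as movdqa, which is presumably why the aligned variants were no longer worth carrying. A minimal before/after sketch of the dispatch, abridged from the ARGBInterpolate hunk below (the full condition also checks src_argb1 and src_stride_argb1):

// Before r1101: three-level dispatch; the movdqa kernel runs only when
// every pointer and stride is 16-byte aligned.
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
  InterpolateRow = InterpolateRow_Any_SSE2;          // handles any width
  if (IS_ALIGNED(width, 4)) {
    InterpolateRow = InterpolateRow_Unaligned_SSE2;  // movdqu kernel
    if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
      InterpolateRow = InterpolateRow_SSE2;          // movdqa kernel
    }
  }
}

// After r1101: InterpolateRow_SSE2 itself uses movdqu, so two levels suffice.
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
  InterpolateRow = InterpolateRow_Any_SSE2;
  if (IS_ALIGNED(width, 4)) {
    InterpolateRow = InterpolateRow_SSE2;
  }
}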
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1099
|
Version: 1101
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -1748,12 +1748,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
|||||||
void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
|
void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
|
||||||
ptrdiff_t src_stride_ptr, int width,
|
ptrdiff_t src_stride_ptr, int width,
|
||||||
int source_y_fraction);
|
int source_y_fraction);
|
||||||
void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride_ptr, int width,
|
|
||||||
int source_y_fraction);
|
|
||||||
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride_ptr, int width,
|
|
||||||
int source_y_fraction);
|
|
||||||
void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||||
ptrdiff_t src_stride_ptr, int width,
|
ptrdiff_t src_stride_ptr, int width,
|
||||||
int source_y_fraction);
|
int source_y_fraction);
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1099
|
#define LIBYUV_VERSION 1101
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -1798,27 +1798,17 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
|
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||||
if (IS_ALIGNED(width, 4)) {
|
if (IS_ALIGNED(width, 4)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSE2;
|
|
||||||
if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
|
|
||||||
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
|
|
||||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSE2;
|
InterpolateRow = InterpolateRow_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
|
if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(width, 4)) {
|
if (IS_ALIGNED(width, 4)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
|
|
||||||
IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
|
|
||||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSSE3;
|
InterpolateRow = InterpolateRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
|
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
|
||||||
|
|||||||
@ -579,11 +579,11 @@ NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
|
|||||||
InterpolateRow_C, 1, 1, 32)
|
InterpolateRow_C, 1, 1, 32)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_INTERPOLATEROW_SSSE3
|
#ifdef HAS_INTERPOLATEROW_SSSE3
|
||||||
NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
|
NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3,
|
||||||
InterpolateRow_C, 1, 1, 15)
|
InterpolateRow_C, 1, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_INTERPOLATEROW_SSE2
|
#ifdef HAS_INTERPOLATEROW_SSE2
|
||||||
NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
|
NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2,
|
||||||
InterpolateRow_C, 1, 1, 15)
|
InterpolateRow_C, 1, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_INTERPOLATEROW_NEON
|
#ifdef HAS_INTERPOLATEROW_NEON
|
||||||
|
|||||||
@ -6322,7 +6322,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
|||||||
}
|
}
|
||||||
#endif // HAS_INTERPOLATEROW_AVX2
|
#endif // HAS_INTERPOLATEROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_INTERPOLATEROW_SSSE3
|
|
||||||
// Bilinear filter 16x2 -> 16x1
|
// Bilinear filter 16x2 -> 16x1
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||||
@ -6356,225 +6355,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
|||||||
punpcklwd xmm5, xmm5
|
punpcklwd xmm5, xmm5
|
||||||
pshufd xmm5, xmm5, 0
|
pshufd xmm5, xmm5, 0
|
||||||
|
|
||||||
align 4
|
|
||||||
xloop:
|
|
||||||
movdqa xmm0, [esi]
|
|
||||||
movdqa xmm2, [esi + edx]
|
|
||||||
movdqa xmm1, xmm0
|
|
||||||
punpcklbw xmm0, xmm2
|
|
||||||
punpckhbw xmm1, xmm2
|
|
||||||
pmaddubsw xmm0, xmm5
|
|
||||||
pmaddubsw xmm1, xmm5
|
|
||||||
psrlw xmm0, 7
|
|
||||||
psrlw xmm1, 7
|
|
||||||
packuswb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 25 / 75.
|
|
||||||
align 4
|
|
||||||
xloop25:
|
|
||||||
movdqa xmm0, [esi]
|
|
||||||
movdqa xmm1, [esi + edx]
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop25
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 50 / 50.
|
|
||||||
align 4
|
|
||||||
xloop50:
|
|
||||||
movdqa xmm0, [esi]
|
|
||||||
movdqa xmm1, [esi + edx]
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop50
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 75 / 25.
|
|
||||||
align 4
|
|
||||||
xloop75:
|
|
||||||
movdqa xmm1, [esi]
|
|
||||||
movdqa xmm0, [esi + edx]
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop75
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 100 / 0 - Copy row unchanged.
|
|
||||||
align 4
|
|
||||||
xloop100:
|
|
||||||
movdqa xmm0, [esi]
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop100
|
|
||||||
|
|
||||||
xloop99:
|
|
||||||
pop edi
|
|
||||||
pop esi
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif // HAS_INTERPOLATEROW_SSSE3
|
|
||||||
|
|
||||||
#ifdef HAS_INTERPOLATEROW_SSE2
|
|
||||||
// Bilinear filter 16x2 -> 16x1
|
|
||||||
__declspec(naked) __declspec(align(16))
|
|
||||||
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride, int dst_width,
|
|
||||||
int source_y_fraction) {
|
|
||||||
__asm {
|
|
||||||
push esi
|
|
||||||
push edi
|
|
||||||
mov edi, [esp + 8 + 4] // dst_ptr
|
|
||||||
mov esi, [esp + 8 + 8] // src_ptr
|
|
||||||
mov edx, [esp + 8 + 12] // src_stride
|
|
||||||
mov ecx, [esp + 8 + 16] // dst_width
|
|
||||||
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
|
||||||
sub edi, esi
|
|
||||||
// Dispatch to specialized filters if applicable.
|
|
||||||
cmp eax, 0
|
|
||||||
je xloop100 // 0 / 256. Blend 100 / 0.
|
|
||||||
cmp eax, 64
|
|
||||||
je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
|
|
||||||
cmp eax, 128
|
|
||||||
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
|
|
||||||
cmp eax, 192
|
|
||||||
je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
|
|
||||||
|
|
||||||
movd xmm5, eax // xmm5 = y fraction
|
|
||||||
punpcklbw xmm5, xmm5
|
|
||||||
psrlw xmm5, 1
|
|
||||||
punpcklwd xmm5, xmm5
|
|
||||||
punpckldq xmm5, xmm5
|
|
||||||
punpcklqdq xmm5, xmm5
|
|
||||||
pxor xmm4, xmm4
|
|
||||||
|
|
||||||
align 4
|
|
||||||
xloop:
|
|
||||||
movdqa xmm0, [esi] // row0
|
|
||||||
movdqa xmm2, [esi + edx] // row1
|
|
||||||
movdqa xmm1, xmm0
|
|
||||||
movdqa xmm3, xmm2
|
|
||||||
punpcklbw xmm2, xmm4
|
|
||||||
punpckhbw xmm3, xmm4
|
|
||||||
punpcklbw xmm0, xmm4
|
|
||||||
punpckhbw xmm1, xmm4
|
|
||||||
psubw xmm2, xmm0 // row1 - row0
|
|
||||||
psubw xmm3, xmm1
|
|
||||||
paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
|
|
||||||
paddw xmm3, xmm3
|
|
||||||
pmulhw xmm2, xmm5 // scale diff
|
|
||||||
pmulhw xmm3, xmm5
|
|
||||||
paddw xmm0, xmm2 // sum rows
|
|
||||||
paddw xmm1, xmm3
|
|
||||||
packuswb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 25 / 75.
|
|
||||||
align 4
|
|
||||||
xloop25:
|
|
||||||
movdqa xmm0, [esi]
|
|
||||||
movdqa xmm1, [esi + edx]
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop25
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 50 / 50.
|
|
||||||
align 4
|
|
||||||
xloop50:
|
|
||||||
movdqa xmm0, [esi]
|
|
||||||
movdqa xmm1, [esi + edx]
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop50
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 75 / 25.
|
|
||||||
align 4
|
|
||||||
xloop75:
|
|
||||||
movdqa xmm1, [esi]
|
|
||||||
movdqa xmm0, [esi + edx]
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
pavgb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop75
|
|
||||||
jmp xloop99
|
|
||||||
|
|
||||||
// Blend 100 / 0 - Copy row unchanged.
|
|
||||||
align 4
|
|
||||||
xloop100:
|
|
||||||
movdqa xmm0, [esi]
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [esi + edi], xmm0
|
|
||||||
lea esi, [esi + 16]
|
|
||||||
jg xloop100
|
|
||||||
|
|
||||||
xloop99:
|
|
||||||
pop edi
|
|
||||||
pop esi
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif // HAS_INTERPOLATEROW_SSE2
|
|
||||||
|
|
||||||
// Bilinear filter 16x2 -> 16x1
|
|
||||||
__declspec(naked) __declspec(align(16))
|
|
||||||
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride, int dst_width,
|
|
||||||
int source_y_fraction) {
|
|
||||||
__asm {
|
|
||||||
push esi
|
|
||||||
push edi
|
|
||||||
mov edi, [esp + 8 + 4] // dst_ptr
|
|
||||||
mov esi, [esp + 8 + 8] // src_ptr
|
|
||||||
mov edx, [esp + 8 + 12] // src_stride
|
|
||||||
mov ecx, [esp + 8 + 16] // dst_width
|
|
||||||
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
|
||||||
sub edi, esi
|
|
||||||
shr eax, 1
|
|
||||||
// Dispatch to specialized filters if applicable.
|
|
||||||
cmp eax, 0
|
|
||||||
je xloop100 // 0 / 128. Blend 100 / 0.
|
|
||||||
cmp eax, 32
|
|
||||||
je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
|
|
||||||
cmp eax, 64
|
|
||||||
je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
|
|
||||||
cmp eax, 96
|
|
||||||
je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
|
|
||||||
|
|
||||||
movd xmm0, eax // high fraction 0..127
|
|
||||||
neg eax
|
|
||||||
add eax, 128
|
|
||||||
movd xmm5, eax // low fraction 128..1
|
|
||||||
punpcklbw xmm5, xmm0
|
|
||||||
punpcklwd xmm5, xmm5
|
|
||||||
pshufd xmm5, xmm5, 0
|
|
||||||
|
|
||||||
align 4
|
align 4
|
||||||
xloop:
|
xloop:
|
||||||
movdqu xmm0, [esi]
|
movdqu xmm0, [esi]
|
||||||
@ -6650,7 +6430,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
|||||||
#ifdef HAS_INTERPOLATEROW_SSE2
|
#ifdef HAS_INTERPOLATEROW_SSE2
|
||||||
// Bilinear filter 16x2 -> 16x1
|
// Bilinear filter 16x2 -> 16x1
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||||
ptrdiff_t src_stride, int dst_width,
|
ptrdiff_t src_stride, int dst_width,
|
||||||
int source_y_fraction) {
|
int source_y_fraction) {
|
||||||
__asm {
|
__asm {
|
||||||
|
|||||||
@ -59,17 +59,10 @@ static void ScalePlaneDown2(int src_width, int src_height,
|
|||||||
}
|
}
|
||||||
#elif defined(HAS_SCALEROWDOWN2_SSE2)
|
#elif defined(HAS_SCALEROWDOWN2_SSE2)
|
||||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
|
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
|
||||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
|
|
||||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
|
|
||||||
ScaleRowDown2Box_Unaligned_SSE2);
|
|
||||||
if (IS_ALIGNED(src_ptr, 16) &&
|
|
||||||
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
|
|
||||||
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
|
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
|
||||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
|
(filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
|
||||||
ScaleRowDown2Box_SSE2);
|
ScaleRowDown2Box_SSE2);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
|
#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
|
||||||
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
|
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
|
||||||
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
|
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
|
||||||
@ -114,18 +107,10 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
|
|||||||
}
|
}
|
||||||
#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
|
#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
|
||||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
|
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
|
||||||
ScaleRowDown2 = filtering == kFilterNone ?
|
|
||||||
ScaleRowDown2_Unaligned_16_SSE2 :
|
|
||||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
|
|
||||||
ScaleRowDown2Box_Unaligned_16_SSE2);
|
|
||||||
if (IS_ALIGNED(src_ptr, 16) &&
|
|
||||||
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
|
|
||||||
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
|
ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
|
||||||
(filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
|
(filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
|
||||||
ScaleRowDown2Box_16_SSE2);
|
ScaleRowDown2Box_16_SSE2);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
|
#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
|
||||||
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
|
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
|
||||||
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
|
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
|
||||||
@ -889,23 +874,17 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||||
if (IS_ALIGNED(src_width, 16)) {
|
if (IS_ALIGNED(src_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSE2;
|
|
||||||
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSE2;
|
InterpolateRow = InterpolateRow_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(src_width, 16)) {
|
if (IS_ALIGNED(src_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSSE3;
|
InterpolateRow = InterpolateRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
|
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
|
||||||
@ -991,23 +970,17 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_16_SSE2;
|
InterpolateRow = InterpolateRow_Any_16_SSE2;
|
||||||
if (IS_ALIGNED(src_width, 16)) {
|
if (IS_ALIGNED(src_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
|
|
||||||
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_16_SSE2;
|
InterpolateRow = InterpolateRow_16_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_16_SSSE3;
|
InterpolateRow = InterpolateRow_Any_16_SSSE3;
|
||||||
if (IS_ALIGNED(src_width, 16)) {
|
if (IS_ALIGNED(src_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
|
|
||||||
if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_16_SSSE3;
|
InterpolateRow = InterpolateRow_16_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_16_AVX2)
|
#if defined(HAS_INTERPOLATEROW_16_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
|
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
|
||||||
@ -1090,23 +1063,17 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||||
if (IS_ALIGNED(dst_width, 16)) {
|
if (IS_ALIGNED(dst_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSE2;
|
|
||||||
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSE2;
|
InterpolateRow = InterpolateRow_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(dst_width, 16)) {
|
if (IS_ALIGNED(dst_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSSE3;
|
InterpolateRow = InterpolateRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
|
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
|
||||||
@ -1229,23 +1196,17 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_16_SSE2;
|
InterpolateRow = InterpolateRow_Any_16_SSE2;
|
||||||
if (IS_ALIGNED(dst_width, 16)) {
|
if (IS_ALIGNED(dst_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
|
|
||||||
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_16_SSE2;
|
InterpolateRow = InterpolateRow_16_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_16_SSSE3;
|
InterpolateRow = InterpolateRow_Any_16_SSSE3;
|
||||||
if (IS_ALIGNED(dst_width, 16)) {
|
if (IS_ALIGNED(dst_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
|
|
||||||
if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_16_SSSE3;
|
InterpolateRow = InterpolateRow_16_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_16_AVX2)
|
#if defined(HAS_INTERPOLATEROW_16_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
|
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
|
||||||
|
|||||||
@ -193,23 +193,17 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||||
if (IS_ALIGNED(clip_src_width, 16)) {
|
if (IS_ALIGNED(clip_src_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSE2;
|
|
||||||
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSE2;
|
InterpolateRow = InterpolateRow_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(clip_src_width, 16)) {
|
if (IS_ALIGNED(clip_src_width, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSSE3;
|
InterpolateRow = InterpolateRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
|
if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
|
||||||
@ -289,23 +283,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
|
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||||
if (IS_ALIGNED(dst_width, 4)) {
|
if (IS_ALIGNED(dst_width, 4)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSE2;
|
|
||||||
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSE2;
|
InterpolateRow = InterpolateRow_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
|
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(dst_width, 4)) {
|
if (IS_ALIGNED(dst_width, 4)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSSE3;
|
InterpolateRow = InterpolateRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
|
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
|
||||||
@ -430,12 +418,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
|
if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
|
||||||
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
|
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(src_width, 8)) {
|
if (IS_ALIGNED(src_width, 8)) {
|
||||||
I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
|
||||||
I422ToARGBRow = I422ToARGBRow_SSSE3;
|
I422ToARGBRow = I422ToARGBRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_I422TOARGBROW_AVX2)
|
#if defined(HAS_I422TOARGBROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
|
if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
|
||||||
@ -470,23 +455,17 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
|
if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||||
if (IS_ALIGNED(dst_width, 4)) {
|
if (IS_ALIGNED(dst_width, 4)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSE2;
|
|
||||||
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSE2;
|
InterpolateRow = InterpolateRow_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
|
if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(dst_width, 4)) {
|
if (IS_ALIGNED(dst_width, 4)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSSE3;
|
InterpolateRow = InterpolateRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
|
if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
|
||||||
|
|||||||
@ -888,25 +888,17 @@ void ScalePlaneVertical(int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
|
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSE2;
|
InterpolateRow = InterpolateRow_Any_SSE2;
|
||||||
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSE2;
|
|
||||||
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
|
|
||||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSE2;
|
InterpolateRow = InterpolateRow_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_SSSE3;
|
InterpolateRow = InterpolateRow_Any_SSSE3;
|
||||||
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_SSSE3;
|
|
||||||
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
|
|
||||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_SSSE3;
|
InterpolateRow = InterpolateRow_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_AVX2)
|
#if defined(HAS_INTERPOLATEROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
|
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
|
||||||
@ -970,25 +962,17 @@ void ScalePlaneVertical_16(int src_height,
|
|||||||
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
|
if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_16_SSE2;
|
InterpolateRow = InterpolateRow_Any_16_SSE2;
|
||||||
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
|
|
||||||
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
|
|
||||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_16_SSE2;
|
InterpolateRow = InterpolateRow_16_SSE2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
|
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
|
||||||
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
|
if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
|
||||||
InterpolateRow = InterpolateRow_Any_16_SSSE3;
|
InterpolateRow = InterpolateRow_Any_16_SSSE3;
|
||||||
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
if (IS_ALIGNED(dst_width_bytes, 16)) {
|
||||||
InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
|
|
||||||
if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
|
|
||||||
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
|
|
||||||
InterpolateRow = InterpolateRow_16_SSSE3;
|
InterpolateRow = InterpolateRow_16_SSSE3;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_INTERPOLATEROW_16_AVX2)
|
#if defined(HAS_INTERPOLATEROW_16_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
|
if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
|
||||||
|
|||||||
@ -101,13 +101,13 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"psrlw $0x8,%%xmm0 \n"
|
"psrlw $0x8,%%xmm0 \n"
|
||||||
"psrlw $0x8,%%xmm1 \n"
|
"psrlw $0x8,%%xmm1 \n"
|
||||||
"packuswb %%xmm1,%%xmm0 \n"
|
"packuswb %%xmm1,%%xmm0 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(1) " \n"
|
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"sub $0x10,%2 \n"
|
"sub $0x10,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
@ -130,8 +130,8 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"movdqa %%xmm0,%%xmm2 \n"
|
"movdqa %%xmm0,%%xmm2 \n"
|
||||||
"psrlw $0x8,%%xmm0 \n"
|
"psrlw $0x8,%%xmm0 \n"
|
||||||
@ -142,7 +142,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
"pavgw %%xmm2,%%xmm0 \n"
|
"pavgw %%xmm2,%%xmm0 \n"
|
||||||
"pavgw %%xmm3,%%xmm1 \n"
|
"pavgw %%xmm3,%%xmm1 \n"
|
||||||
"packuswb %%xmm1,%%xmm0 \n"
|
"packuswb %%xmm1,%%xmm0 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(1) " \n"
|
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"sub $0x10,%2 \n"
|
"sub $0x10,%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
@ -165,8 +165,8 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
|
MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
|
||||||
BUNDLEALIGN
|
BUNDLEALIGN
|
||||||
MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
|
MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
|
||||||
@ -182,112 +182,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
"pavgw %%xmm2,%%xmm0 \n"
|
"pavgw %%xmm2,%%xmm0 \n"
|
||||||
"pavgw %%xmm3,%%xmm1 \n"
|
"pavgw %%xmm3,%%xmm1 \n"
|
||||||
"packuswb %%xmm1,%%xmm0 \n"
|
"packuswb %%xmm1,%%xmm0 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(1) " \n"
|
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
|
||||||
"sub $0x10,%2 \n"
|
|
||||||
"jg 1b \n"
|
|
||||||
: "+r"(src_ptr), // %0
|
|
||||||
"+r"(dst_ptr), // %1
|
|
||||||
"+r"(dst_width) // %2
|
|
||||||
: "r"((intptr_t)(src_stride)) // %3
|
|
||||||
: "memory", "cc"
|
|
||||||
#if defined(__native_client__) && defined(__x86_64__)
|
|
||||||
, "r14"
|
|
||||||
#endif
|
|
||||||
#if defined(__SSE2__)
|
|
||||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
|
||||||
#endif
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|
||||||
uint8* dst_ptr, int dst_width) {
|
|
||||||
asm volatile (
|
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
|
||||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
|
||||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
|
||||||
"psrlw $0x8,%%xmm0 \n"
|
|
||||||
"psrlw $0x8,%%xmm1 \n"
|
|
||||||
"packuswb %%xmm1,%%xmm0 \n"
|
|
||||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
|
||||||
"sub $0x10,%2 \n"
|
|
||||||
"jg 1b \n"
|
|
||||||
: "+r"(src_ptr), // %0
|
|
||||||
"+r"(dst_ptr), // %1
|
|
||||||
"+r"(dst_width) // %2
|
|
||||||
:
|
|
||||||
: "memory", "cc"
|
|
||||||
#if defined(__SSE2__)
|
|
||||||
, "xmm0", "xmm1"
|
|
||||||
#endif
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride,
|
|
||||||
uint8* dst_ptr, int dst_width) {
|
|
||||||
asm volatile (
|
|
||||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
||||||
"psrlw $0x8,%%xmm5 \n"
|
|
||||||
|
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
|
||||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
|
||||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
|
||||||
"movdqa %%xmm0,%%xmm2 \n"
|
|
||||||
"psrlw $0x8,%%xmm0 \n"
|
|
||||||
"movdqa %%xmm1,%%xmm3 \n"
|
|
||||||
"psrlw $0x8,%%xmm1 \n"
|
|
||||||
"pand %%xmm5,%%xmm2 \n"
|
|
||||||
"pand %%xmm5,%%xmm3 \n"
|
|
||||||
"pavgw %%xmm2,%%xmm0 \n"
|
|
||||||
"pavgw %%xmm3,%%xmm1 \n"
|
|
||||||
"packuswb %%xmm1,%%xmm0 \n"
|
|
||||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
|
||||||
"sub $0x10,%2 \n"
|
|
||||||
"jg 1b \n"
|
|
||||||
: "+r"(src_ptr), // %0
|
|
||||||
"+r"(dst_ptr), // %1
|
|
||||||
"+r"(dst_width) // %2
|
|
||||||
:
|
|
||||||
: "memory", "cc"
|
|
||||||
#if defined(__SSE2__)
|
|
||||||
, "xmm0", "xmm1", "xmm5"
|
|
||||||
#endif
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride,
|
|
||||||
uint8* dst_ptr, int dst_width) {
|
|
||||||
asm volatile (
|
|
||||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
||||||
"psrlw $0x8,%%xmm5 \n"
|
|
||||||
|
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
|
||||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
|
||||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
|
||||||
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
|
|
||||||
BUNDLEALIGN
|
|
||||||
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
|
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
|
||||||
"pavgb %%xmm2,%%xmm0 \n"
|
|
||||||
"pavgb %%xmm3,%%xmm1 \n"
|
|
||||||
"movdqa %%xmm0,%%xmm2 \n"
|
|
||||||
"psrlw $0x8,%%xmm0 \n"
|
|
||||||
"movdqa %%xmm1,%%xmm3 \n"
|
|
||||||
"psrlw $0x8,%%xmm1 \n"
|
|
||||||
"pand %%xmm5,%%xmm2 \n"
|
|
||||||
"pand %%xmm5,%%xmm3 \n"
|
|
||||||
"pavgw %%xmm2,%%xmm0 \n"
|
|
||||||
"pavgw %%xmm3,%%xmm1 \n"
|
|
||||||
"packuswb %%xmm1,%%xmm0 \n"
|
|
||||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"sub $0x10,%2 \n"
|
"sub $0x10,%2 \n"
|
||||||
@ -315,8 +209,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"pand %%xmm5,%%xmm0 \n"
|
"pand %%xmm5,%%xmm0 \n"
|
||||||
"pand %%xmm5,%%xmm1 \n"
|
"pand %%xmm5,%%xmm1 \n"
|
||||||
@ -348,8 +242,8 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
|
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
|
||||||
BUNDLEALIGN
|
BUNDLEALIGN
|
||||||
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
|
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
|
||||||
@ -412,8 +306,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"movdqa %%xmm2,%%xmm1 \n"
|
"movdqa %%xmm2,%%xmm1 \n"
|
||||||
"palignr $0x8,%%xmm0,%%xmm1 \n"
|
"palignr $0x8,%%xmm0,%%xmm1 \n"
|
||||||
@ -461,7 +355,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
|
||||||
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
|
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
|
||||||
"pavgb %%xmm7,%%xmm6 \n"
|
"pavgb %%xmm7,%%xmm6 \n"
|
||||||
"pshufb %%xmm2,%%xmm6 \n"
|
"pshufb %%xmm2,%%xmm6 \n"
|
||||||
@ -479,7 +373,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
|||||||
"psrlw $0x2,%%xmm6 \n"
|
"psrlw $0x2,%%xmm6 \n"
|
||||||
"packuswb %%xmm6,%%xmm6 \n"
|
"packuswb %%xmm6,%%xmm6 \n"
|
||||||
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
|
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
|
||||||
BUNDLEALIGN
|
BUNDLEALIGN
|
||||||
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
|
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
@ -533,7 +427,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
|
||||||
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
|
MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
|
||||||
"pavgb %%xmm6,%%xmm7 \n"
|
"pavgb %%xmm6,%%xmm7 \n"
|
||||||
"pavgb %%xmm7,%%xmm6 \n"
|
"pavgb %%xmm7,%%xmm6 \n"
|
||||||
@ -553,7 +447,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
|||||||
"psrlw $0x2,%%xmm6 \n"
|
"psrlw $0x2,%%xmm6 \n"
|
||||||
"packuswb %%xmm6,%%xmm6 \n"
|
"packuswb %%xmm6,%%xmm6 \n"
|
||||||
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
|
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
|
||||||
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
|
MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"pavgb %%xmm6,%%xmm7 \n"
|
"pavgb %%xmm6,%%xmm7 \n"
|
||||||
@ -590,8 +484,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"pshufb %%xmm4,%%xmm0 \n"
|
"pshufb %%xmm4,%%xmm0 \n"
|
||||||
"pshufb %%xmm5,%%xmm1 \n"
|
"pshufb %%xmm5,%%xmm1 \n"
|
||||||
@ -631,7 +525,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
|
MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
|
||||||
"lea " MEMLEA(0x10,0) ",%0 \n"
|
"lea " MEMLEA(0x10,0) ",%0 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -679,7 +573,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
|
MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
|
||||||
"movhlps %%xmm0,%%xmm1 \n"
|
"movhlps %%xmm0,%%xmm1 \n"
|
||||||
"movhlps %%xmm6,%%xmm7 \n"
|
"movhlps %%xmm6,%%xmm7 \n"
|
||||||
@ -741,7 +635,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"mov %0,%3 \n"
|
"mov %0,%3 \n"
|
||||||
"add %6,%0 \n"
|
"add %6,%0 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -753,7 +647,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"2: \n"
|
"2: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm2 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
|
||||||
"add %6,%0 \n"
|
"add %6,%0 \n"
|
||||||
"movdqa %%xmm2,%%xmm3 \n"
|
"movdqa %%xmm2,%%xmm3 \n"
|
||||||
"punpcklbw %%xmm4,%%xmm2 \n"
|
"punpcklbw %%xmm4,%%xmm2 \n"
|
||||||
@ -765,8 +659,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"3: \n"
|
"3: \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(1) " \n"
|
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||||
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
|
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
|
||||||
"lea " MEMLEA(0x10,3) ",%0 \n"
|
"lea " MEMLEA(0x10,3) ",%0 \n"
|
||||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||||
"sub $0x10,%4 \n"
|
"sub $0x10,%4 \n"
|
||||||
@ -870,14 +764,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(1) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
"punpcklbw %%xmm0,%%xmm0 \n"
|
"punpcklbw %%xmm0,%%xmm0 \n"
|
||||||
"punpckhbw %%xmm1,%%xmm1 \n"
|
"punpckhbw %%xmm1,%%xmm1 \n"
|
||||||
"sub $0x20,%2 \n"
|
"sub $0x20,%2 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(0) " \n"
|
"movdqu %%xmm0," MEMACCESS(0) " \n"
|
||||||
"movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
|
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
|
|
||||||
@ -898,12 +792,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"shufps $0xdd,%%xmm1,%%xmm0 \n"
|
"shufps $0xdd,%%xmm1,%%xmm0 \n"
|
||||||
"sub $0x4,%2 \n"
|
"sub $0x4,%2 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(1) " \n"
|
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
@ -923,15 +817,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"movdqa %%xmm0,%%xmm2 \n"
|
"movdqa %%xmm0,%%xmm2 \n"
|
||||||
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
||||||
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
||||||
"pavgb %%xmm2,%%xmm0 \n"
|
"pavgb %%xmm2,%%xmm0 \n"
|
||||||
"sub $0x4,%2 \n"
|
"sub $0x4,%2 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(1) " \n"
|
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
@ -951,8 +845,8 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||||
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||||
BUNDLEALIGN
|
BUNDLEALIGN
|
||||||
MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
|
MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
|
||||||
MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
|
MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
|
||||||
@ -964,7 +858,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
|||||||
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
||||||
"pavgb %%xmm2,%%xmm0 \n"
|
"pavgb %%xmm2,%%xmm0 \n"
|
||||||
"sub $0x4,%2 \n"
|
"sub $0x4,%2 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(1) " \n"
|
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
@ -1003,7 +897,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
|||||||
"punpckldq %%xmm3,%%xmm2 \n"
|
"punpckldq %%xmm3,%%xmm2 \n"
|
||||||
"punpcklqdq %%xmm2,%%xmm0 \n"
|
"punpcklqdq %%xmm2,%%xmm0 \n"
|
||||||
"sub $0x4,%3 \n"
|
"sub $0x4,%3 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(2) " \n"
|
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
@ -1056,7 +950,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
|||||||
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
||||||
"pavgb %%xmm2,%%xmm0 \n"
|
"pavgb %%xmm2,%%xmm0 \n"
|
||||||
"sub $0x4,%3 \n"
|
"sub $0x4,%3 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(2) " \n"
|
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
@ -1156,14 +1050,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
LABELALIGN
|
LABELALIGN
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa " MEMACCESS(1) ",%%xmm0 \n"
|
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
|
||||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
"punpckldq %%xmm0,%%xmm0 \n"
|
"punpckldq %%xmm0,%%xmm0 \n"
|
||||||
"punpckhdq %%xmm1,%%xmm1 \n"
|
"punpckhdq %%xmm1,%%xmm1 \n"
|
||||||
"sub $0x8,%2 \n"
|
"sub $0x8,%2 \n"
|
||||||
"movdqa %%xmm0," MEMACCESS(0) " \n"
|
"movdqu %%xmm0," MEMACCESS(0) " \n"
|
||||||
"movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
|
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
|
||||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
|
|
||||||
|
|||||||
@ -105,14 +105,14 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
align 4
|
align 4
|
||||||
wloop:
|
wloop:
|
||||||
movdqa xmm0, [eax]
|
movdqu xmm0, [eax]
|
||||||
movdqa xmm1, [eax + 16]
|
movdqu xmm1, [eax + 16]
|
||||||
lea eax, [eax + 32]
|
lea eax, [eax + 32]
|
||||||
psrlw xmm0, 8 // isolate odd pixels.
|
psrlw xmm0, 8 // isolate odd pixels.
|
||||||
psrlw xmm1, 8
|
psrlw xmm1, 8
|
||||||
packuswb xmm0, xmm1
|
packuswb xmm0, xmm1
|
||||||
sub ecx, 16
|
sub ecx, 16
|
||||||
movdqa [edx], xmm0
|
movdqu [edx], xmm0
|
||||||
lea edx, [edx + 16]
|
lea edx, [edx + 16]
|
||||||
jg wloop
|
jg wloop
|
||||||
|
|
||||||
@ -135,8 +135,8 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
|
|
||||||
align 4
|
align 4
|
||||||
wloop:
|
wloop:
|
||||||
movdqa xmm0, [eax]
|
movdqu xmm0, [eax]
|
||||||
movdqa xmm1, [eax + 16]
|
movdqu xmm1, [eax + 16]
|
||||||
lea eax, [eax + 32]
|
lea eax, [eax + 32]
|
||||||
|
|
||||||
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
||||||
@ -150,7 +150,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
packuswb xmm0, xmm1
|
packuswb xmm0, xmm1
|
||||||
|
|
||||||
sub ecx, 16
|
sub ecx, 16
|
||||||
movdqa [edx], xmm0
|
movdqu [edx], xmm0
|
||||||
lea edx, [edx + 16]
|
lea edx, [edx + 16]
|
||||||
jg wloop
|
jg wloop
|
||||||
|
|
||||||
@ -172,119 +172,6 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
|||||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||||
psrlw xmm5, 8
|
psrlw xmm5, 8
|
||||||
|
|
||||||
align 4
|
|
||||||
wloop:
|
|
||||||
movdqa xmm0, [eax]
|
|
||||||
movdqa xmm1, [eax + 16]
|
|
||||||
movdqa xmm2, [eax + esi]
|
|
||||||
movdqa xmm3, [eax + esi + 16]
|
|
||||||
lea eax, [eax + 32]
|
|
||||||
pavgb xmm0, xmm2 // average rows
|
|
||||||
pavgb xmm1, xmm3
|
|
||||||
|
|
||||||
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
|
||||||
psrlw xmm0, 8
|
|
||||||
movdqa xmm3, xmm1
|
|
||||||
psrlw xmm1, 8
|
|
||||||
pand xmm2, xmm5
|
|
||||||
pand xmm3, xmm5
|
|
||||||
pavgw xmm0, xmm2
|
|
||||||
pavgw xmm1, xmm3
|
|
||||||
packuswb xmm0, xmm1
|
|
||||||
|
|
||||||
sub ecx, 16
|
|
||||||
movdqa [edx], xmm0
|
|
||||||
lea edx, [edx + 16]
|
|
||||||
jg wloop
|
|
||||||
|
|
||||||
pop esi
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Reads 32 pixels, throws half away and writes 16 pixels.
|
|
||||||
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
|
||||||
__declspec(naked) __declspec(align(16))
|
|
||||||
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride,
|
|
||||||
uint8* dst_ptr, int dst_width) {
|
|
||||||
__asm {
|
|
||||||
mov eax, [esp + 4] // src_ptr
|
|
||||||
// src_stride ignored
|
|
||||||
mov edx, [esp + 12] // dst_ptr
|
|
||||||
mov ecx, [esp + 16] // dst_width
|
|
||||||
|
|
||||||
align 4
|
|
||||||
wloop:
|
|
||||||
movdqu xmm0, [eax]
|
|
||||||
movdqu xmm1, [eax + 16]
|
|
||||||
lea eax, [eax + 32]
|
|
||||||
psrlw xmm0, 8 // isolate odd pixels.
|
|
||||||
psrlw xmm1, 8
|
|
||||||
packuswb xmm0, xmm1
|
|
||||||
sub ecx, 16
|
|
||||||
movdqu [edx], xmm0
|
|
||||||
lea edx, [edx + 16]
|
|
||||||
jg wloop
|
|
||||||
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Blends 32x1 rectangle to 16x1.
|
|
||||||
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
|
||||||
__declspec(naked) __declspec(align(16))
|
|
||||||
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride,
|
|
||||||
uint8* dst_ptr, int dst_width) {
|
|
||||||
__asm {
|
|
||||||
mov eax, [esp + 4] // src_ptr
|
|
||||||
// src_stride
|
|
||||||
mov edx, [esp + 12] // dst_ptr
|
|
||||||
mov ecx, [esp + 16] // dst_width
|
|
||||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
|
||||||
psrlw xmm5, 8
|
|
||||||
|
|
||||||
align 4
|
|
||||||
wloop:
|
|
||||||
movdqu xmm0, [eax]
|
|
||||||
movdqu xmm1, [eax + 16]
|
|
||||||
lea eax, [eax + 32]
|
|
||||||
|
|
||||||
movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
|
|
||||||
psrlw xmm0, 8
|
|
||||||
movdqa xmm3, xmm1
|
|
||||||
psrlw xmm1, 8
|
|
||||||
pand xmm2, xmm5
|
|
||||||
pand xmm3, xmm5
|
|
||||||
pavgw xmm0, xmm2
|
|
||||||
pavgw xmm1, xmm3
|
|
||||||
packuswb xmm0, xmm1
|
|
||||||
|
|
||||||
sub ecx, 16
|
|
||||||
movdqu [edx], xmm0
|
|
||||||
lea edx, [edx + 16]
|
|
||||||
jg wloop
|
|
||||||
|
|
||||||
ret
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Blends 32x2 rectangle to 16x1.
|
|
||||||
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
|
|
||||||
__declspec(naked) __declspec(align(16))
|
|
||||||
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
|
|
||||||
ptrdiff_t src_stride,
|
|
||||||
uint8* dst_ptr, int dst_width) {
|
|
||||||
__asm {
|
|
||||||
push esi
|
|
||||||
mov eax, [esp + 4 + 4] // src_ptr
|
|
||||||
mov esi, [esp + 4 + 8] // src_stride
|
|
||||||
mov edx, [esp + 4 + 12] // dst_ptr
|
|
||||||
mov ecx, [esp + 4 + 16] // dst_width
|
|
||||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
|
||||||
psrlw xmm5, 8
|
|
||||||
|
|
||||||
align 4
|
align 4
|
||||||
wloop:
|
wloop:
|
||||||
movdqu xmm0, [eax]
|
movdqu xmm0, [eax]
|
||||||
@@ -331,8 +218,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     pand xmm0, xmm5
     pand xmm1, xmm5
@@ -366,16 +253,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
-    movdqa xmm2, [eax + esi]
-    movdqa xmm3, [eax + esi + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + esi]
+    movdqu xmm3, [eax + esi + 16]
     pavgb xmm0, xmm2    // average rows
     pavgb xmm1, xmm3
-    movdqa xmm2, [eax + esi * 2]
-    movdqa xmm3, [eax + esi * 2 + 16]
-    movdqa xmm4, [eax + edi]
-    movdqa xmm5, [eax + edi + 16]
+    movdqu xmm2, [eax + esi * 2]
+    movdqu xmm3, [eax + esi * 2 + 16]
+    movdqu xmm4, [eax + edi]
+    movdqu xmm5, [eax + edi + 16]
     lea eax, [eax + 32]
     pavgb xmm2, xmm4
     pavgb xmm3, xmm5
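In the ScaleRowDown4Box hunk above, four rows are reduced with pairwise pavgb: rows 0+1, rows 2+3, then (later in the function, not shown in this hunk) the two intermediates; edi appears to hold src_stride * 3 from the prologue. The scalar shape of that reduction, as a sketch:

#include <stdint.h>

// Scalar equivalent of pavgb: average with rounding up.
static uint8_t avg_u8(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

// Vertical reduction used above: avg(avg(row0, row1), avg(row2, row3)).
// Three rounded averages approximate (r0 + r1 + r2 + r3 + 2) >> 2.
static uint8_t Box4Vertical_Sketch(uint8_t r0, uint8_t r1, uint8_t r2, uint8_t r3) {
  return avg_u8(avg_u8(r0, r1), avg_u8(r2, r3));
}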
@@ -429,8 +316,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     movdqa xmm2, xmm1
     palignr xmm1, xmm0, 8
@@ -483,8 +370,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]    // pixels 0..7
-    movdqa xmm1, [eax + esi]
+    movdqu xmm0, [eax]    // pixels 0..7
+    movdqu xmm1, [eax + esi]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm2
     pmaddubsw xmm0, xmm5
@@ -501,8 +388,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     psrlw xmm0, 2
     packuswb xmm0, xmm0
     movq qword ptr [edx + 8], xmm0
-    movdqa xmm0, [eax + 16]    // pixels 16..23
-    movdqa xmm1, [eax + esi + 16]
+    movdqu xmm0, [eax + 16]    // pixels 16..23
+    movdqu xmm1, [eax + esi + 16]
     lea eax, [eax + 32]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm4
@@ -542,8 +429,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]    // pixels 0..7
-    movdqa xmm1, [eax + esi]
+    movdqu xmm0, [eax]    // pixels 0..7
+    movdqu xmm1, [eax + esi]
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
     pshufb xmm0, xmm2
@@ -562,8 +449,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     psrlw xmm0, 2
     packuswb xmm0, xmm0
     movq qword ptr [edx + 8], xmm0
-    movdqa xmm0, [eax + 16]    // pixels 16..23
-    movdqa xmm1, [eax + esi + 16]
+    movdqu xmm0, [eax + 16]    // pixels 16..23
+    movdqu xmm1, [eax + esi + 16]
     lea eax, [eax + 32]
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
@@ -599,8 +486,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
 
     align 4
   xloop:
-    movdqa xmm0, [eax]    // 16 pixels -> 0,1,2,3,4,5
-    movdqa xmm1, [eax + 16]    // 16 pixels -> 6,7,8,9,10,11
+    movdqu xmm0, [eax]    // 16 pixels -> 0,1,2,3,4,5
+    movdqu xmm1, [eax + 16]    // 16 pixels -> 6,7,8,9,10,11
     lea eax, [eax + 32]
     pshufb xmm0, xmm4
     pshufb xmm1, xmm5
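ScaleRowDown38 emits 3 output pixels per 8 input pixels; the pshufb masks in xmm4/xmm5 gather the surviving bytes from the two 16-byte loads. A scalar sketch of the sampling pattern, assuming dst_width is a multiple of 3 and the 0/3/6 offsets of the scalar reference path:

#include <stdint.h>

// 8 -> 3 point sampling: keep pixels 0, 3 and 6 of every 8.
static void ScaleRowDown38_Sketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[x + 0] = src[0];
    dst[x + 1] = src[3];
    dst[x + 2] = src[6];
    src += 8;
  }
}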
@@ -635,8 +522,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
 
     align 4
   xloop:
-    movdqa xmm0, [eax]    // sum up 3 rows into xmm0/1
-    movdqa xmm6, [eax + esi]
+    movdqu xmm0, [eax]    // sum up 3 rows into xmm0/1
+    movdqu xmm6, [eax + esi]
     movhlps xmm1, xmm0
     movhlps xmm7, xmm6
     punpcklbw xmm0, xmm5
@@ -645,7 +532,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     punpcklbw xmm7, xmm5
     paddusw xmm0, xmm6
     paddusw xmm1, xmm7
-    movdqa xmm6, [eax + esi * 2]
+    movdqu xmm6, [eax + esi * 2]
     lea eax, [eax + 16]
     movhlps xmm7, xmm6
     punpcklbw xmm6, xmm5
@@ -701,7 +588,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
 
     align 4
   xloop:
-    movdqa xmm0, [eax]    // average 2 rows into xmm0
+    movdqu xmm0, [eax]    // average 2 rows into xmm0
     pavgb xmm0, [eax + esi]
     lea eax, [eax + 16]
 
@@ -750,7 +637,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     align 4
   xloop:
     // first row
-    movdqa xmm0, [esi]
+    movdqu xmm0, [esi]
     lea eax, [esi + edx]
     movdqa xmm1, xmm0
     punpcklbw xmm0, xmm4
@@ -763,7 +650,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     // sum remaining rows
     align 4
   yloop:
-    movdqa xmm2, [eax]    // read 16 pixels
+    movdqu xmm2, [eax]    // read 16 pixels
     lea eax, [eax + edx]    // advance to next row
     movdqa xmm3, xmm2
     punpcklbw xmm2, xmm4
@@ -775,8 +662,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 
     align 4
   ydone:
-    movdqa [edi], xmm0
-    movdqa [edi + 16], xmm1
+    movdqu [edi], xmm0
+    movdqu [edi + 16], xmm1
     lea edi, [edi + 32]
 
     sub ecx, 16
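ScaleAddRows_SSE2 widens one row of bytes to words (punpcklbw against a zeroed register), paddusw-accumulates the remaining rows, then stores 16 words per iteration. A scalar sketch of the accumulation; note that paddusw saturates, while this sketch assumes the column sums stay below 65536:

#include <stddef.h>
#include <stdint.h>

// Sum src_height rows of bytes column-wise into 16-bit accumulators.
static void ScaleAddRows_Sketch(const uint8_t* src, ptrdiff_t src_stride,
                                uint16_t* dst, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum = (uint16_t)(sum + src[x + (ptrdiff_t)y * src_stride]);
    }
    dst[x] = sum;
  }
}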
@@ -891,14 +778,14 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
+    movdqu xmm0, [eax]
     lea eax, [eax + 16]
     movdqa xmm1, xmm0
     punpcklbw xmm0, xmm0
     punpckhbw xmm1, xmm1
     sub ecx, 32
-    movdqa [edx], xmm0
-    movdqa [edx + 16], xmm1
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
     jg wloop
 
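ScaleColsUp2 doubles pixels by unpacking a register with itself: punpcklbw xmm0, xmm0 interleaves each low byte with its own copy, and punpckhbw does the same for the high half, so 16 source bytes become 32 output bytes. The scalar equivalent, as an illustrative sketch:

#include <stdint.h>

// 2x horizontal upsample by duplication: each source pixel is written twice.
static void ScaleColsUp2_Sketch(uint8_t* dst, const uint8_t* src, int dst_width) {
  for (int x = 0; x < dst_width / 2; ++x) {
    dst[x * 2 + 0] = src[x];
    dst[x * 2 + 1] = src[x];
  }
}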
@@ -920,12 +807,12 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     shufps xmm0, xmm1, 0xdd
     sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
     jg wloop
 
@@ -947,15 +834,15 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     movdqa xmm2, xmm0
     shufps xmm0, xmm1, 0x88    // even pixels
     shufps xmm2, xmm1, 0xdd    // odd pixels
     pavgb xmm0, xmm2
     sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
     jg wloop
 
@@ -978,10 +865,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
-    movdqa xmm1, [eax + 16]
-    movdqa xmm2, [eax + esi]
-    movdqa xmm3, [eax + esi + 16]
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + esi]
+    movdqu xmm3, [eax + esi + 16]
     lea eax, [eax + 32]
     pavgb xmm0, xmm2    // average rows
     pavgb xmm1, xmm3
@@ -990,7 +877,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     shufps xmm2, xmm1, 0xdd    // odd pixels
     pavgb xmm0, xmm2
     sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
     jg wloop
 
@@ -1027,7 +914,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     punpckldq xmm2, xmm3
     punpcklqdq xmm0, xmm2
     sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
     lea edx, [edx + 16]
     jg wloop
 
@@ -1076,7 +963,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     shufps xmm2, xmm1, 0xdd    // odd pixels
     pavgb xmm0, xmm2
     sub ecx, 4
-    movdqa [edx], xmm0
+    movdqu [edx], xmm0
    lea edx, [edx + 16]
     jg wloop
 
@@ -1267,14 +1154,14 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
 
     align 4
   wloop:
-    movdqa xmm0, [eax]
+    movdqu xmm0, [eax]
     lea eax, [eax + 16]
     movdqa xmm1, xmm0
     punpckldq xmm0, xmm0
     punpckhdq xmm1, xmm1
     sub ecx, 8
-    movdqa [edx], xmm0
-    movdqa [edx + 16], xmm1
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
     jg wloop
 
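The change is mechanical throughout: movdqa faults if its memory operand is not 16-byte aligned, while movdqu accepts any address and, on typical cores of this era, costs about the same when the address happens to be aligned. In intrinsics terms it is the difference between _mm_load_si128 and _mm_loadu_si128 — a minimal sketch, not code from this commit:

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// Copy 16 bytes between arbitrarily aligned buffers. _mm_loadu_si128 and
// _mm_storeu_si128 compile to movdqu, so no alignment precondition is
// imposed on callers.
static void Copy16_Unaligned(const uint8_t* src, uint8_t* dst) {
  __m128i v = _mm_loadu_si128((const __m128i*)src);
  _mm_storeu_si128((__m128i*)dst, v);
}

Dropping the alignment requirement is what lets the separate aligned and _Unaligned_ function pairs collapse into a single movdqu-based version.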