Move the loop-counter sub to immediately before the branch in loops.

Remove CopyRow_X86.
Add CopyRow_Any versions for AVX, SSE2 and NEON.
BUG=269
TESTED=local build
R=harryjin@google.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/26209004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1175 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2014-11-20 21:14:27 +00:00
parent 813bf9f97d
commit 91f240c5db
14 changed files with 254 additions and 299 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1174 Version: 1175
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -111,7 +111,6 @@ extern "C" {
#define HAS_BGRATOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS #define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2 #define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
#define HAS_I400TOARGBROW_SSE2 #define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3 #define HAS_I411TOARGBROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3
@ -877,10 +876,12 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void CopyRow_SSE2(const uint8* src, uint8* dst, int count); void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_AVX(const uint8* src, uint8* dst, int count); void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count); void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count); void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count); void CopyRow_C(const uint8* src, uint8* dst, int count);
void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_16_C(const uint16* src, uint16* dst, int count); void CopyRow_16_C(const uint16* src, uint16* dst, int count);

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1174 #define LIBYUV_VERSION 1175
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -29,7 +29,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"lea " MEMLEA(0x10, 0) ",%0 \n" "lea " MEMLEA(0x10, 0) ",%0 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n" "movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n" "lea " MEMLEA(0x10, 1) ",%1 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n" "movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n" "psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n" "psubusb %%xmm3,%%xmm2 \n"
@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pmaddwd %%xmm2,%%xmm2 \n" "pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n" "paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n" "pshufd $0xee,%%xmm0,%%xmm1 \n"
@ -124,13 +124,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"pmulld %%xmm5,%%xmm1 \n" "pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n" "paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n" "paddd %%xmm2,%%xmm1 \n"
"sub $0x10,%1 \n"
"paddd %%xmm3,%%xmm1 \n" "paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n" "pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n" "paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n" "pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n" "paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n" "paddd %%xmm1,%%xmm0 \n"
"sub $0x10,%1 \n"
"jg 1b \n" "jg 1b \n"
"movd %%xmm0,%3 \n" "movd %%xmm0,%3 \n"
: "+r"(src), // %0 : "+r"(src), // %0

View File

@ -33,7 +33,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
lea eax, [eax + 16] lea eax, [eax + 16]
movdqu xmm2, [edx] movdqu xmm2, [edx]
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2 psubusb xmm1, xmm2
psubusb xmm2, xmm3 psubusb xmm2, xmm3
@ -45,6 +44,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2 pmaddwd xmm2, xmm2
paddd xmm0, xmm1 paddd xmm0, xmm1
paddd xmm0, xmm2 paddd xmm0, xmm2
sub ecx, 16
jg wloop jg wloop
pshufd xmm1, xmm0, 0xee pshufd xmm1, xmm0, 0xee
@ -75,7 +75,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vmovdqu ymm1, [eax] vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx] vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32] lea eax, [eax + 32]
sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1 vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3 vpor ymm1, ymm2, ymm3
@ -85,6 +84,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1 vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1 vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2 vpaddd ymm0, ymm0, ymm2
sub ecx, 32
jg wloop jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@ -170,7 +170,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pmulld(0xcd) // pmulld xmm1, xmm5 pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2 paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3 paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@ -178,6 +177,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01 pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2 paddd xmm1, xmm2
paddd xmm0, xmm1 paddd xmm0, xmm1
sub ecx, 16
jg wloop jg wloop
movd eax, xmm0 // return hash movd eax, xmm0 // return hash
@ -209,13 +209,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
pmulld xmm1, kHashMul3 pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2 paddd xmm1, xmm2
sub ecx, 16
paddd xmm1, xmm3 paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2 paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01 pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2 paddd xmm1, xmm2
paddd xmm0, xmm1 paddd xmm0, xmm1
sub ecx, 16
jg wloop jg wloop
movd eax, xmm0 // return hash movd eax, xmm0 // return hash

View File

@ -188,19 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) { int width, int height) {
int y; int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2) #if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = CopyRow_SSE2; CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
} }
#endif #endif
#if defined(HAS_COPYROW_AVX) #if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = CopyRow_AVX; CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
} }
#endif #endif
#if defined(HAS_COPYROW_ERMS) #if defined(HAS_COPYROW_ERMS)
@ -209,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
} }
#endif #endif
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = CopyRow_NEON; CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
} }
#endif #endif
#if defined(HAS_COPYROW_MIPS) #if defined(HAS_COPYROW_MIPS)
@ -419,24 +414,14 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
dst_stride_v = -dst_stride_v; dst_stride_v = -dst_stride_v;
} }
// CopyRow for rows of just Y in Q420 copied to Y plane of I420. // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2) #if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = CopyRow_SSE2; CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
} }
#endif #endif
#if defined(HAS_COPYROW_AVX) #if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = CopyRow_AVX; CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
} }
#endif #endif
#if defined(HAS_COPYROW_ERMS) #if defined(HAS_COPYROW_ERMS)
@ -444,12 +429,16 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_ERMS; CopyRow = CopyRow_ERMS;
} }
#endif #endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS) #if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) { if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS; CopyRow = CopyRow_MIPS;
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_SSE2) #if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;

View File

@ -41,19 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
if (src_y == dst_y && src_stride_y == dst_stride_y) { if (src_y == dst_y && src_stride_y == dst_stride_y) {
return; return;
} }
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2) #if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = CopyRow_SSE2; CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
} }
#endif #endif
#if defined(HAS_COPYROW_AVX) #if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = CopyRow_AVX; CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
} }
#endif #endif
#if defined(HAS_COPYROW_ERMS) #if defined(HAS_COPYROW_ERMS)
@ -62,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_COPYROW_NEON) #if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = CopyRow_NEON; CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
} }
#endif #endif
#if defined(HAS_COPYROW_MIPS) #if defined(HAS_COPYROW_MIPS)
@ -93,11 +88,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
height = 1; height = 1;
src_stride_y = dst_stride_y = 0; src_stride_y = dst_stride_y = 0;
} }
#if defined(HAS_COPYROW_16_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_16_X86;
}
#endif
#if defined(HAS_COPYROW_16_SSE2) #if defined(HAS_COPYROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2; CopyRow = CopyRow_16_SSE2;

View File

@ -918,24 +918,14 @@ void RotatePlane180(const uint8* src, int src_stride,
MirrorRow = MirrorRow_MIPS_DSPR2; MirrorRow = MirrorRow_MIPS_DSPR2;
} }
#endif #endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2) #if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = CopyRow_SSE2; CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
} }
#endif #endif
#if defined(HAS_COPYROW_AVX) #if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = CopyRow_AVX; CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
} }
#endif #endif
#if defined(HAS_COPYROW_ERMS) #if defined(HAS_COPYROW_ERMS)
@ -943,6 +933,11 @@ void RotatePlane180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS; CopyRow = CopyRow_ERMS;
} }
#endif #endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS) #if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) { if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS; CopyRow = CopyRow_MIPS;

View File

@ -125,24 +125,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
} }
} }
#endif #endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
CopyRow = CopyRow_NEON;
}
#endif
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
CopyRow = CopyRow_X86;
}
#endif
#if defined(HAS_COPYROW_SSE2) #if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) { if (TestCpuFlag(kCpuHasSSE2)) {
CopyRow = CopyRow_SSE2; CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
} }
#endif #endif
#if defined(HAS_COPYROW_AVX) #if defined(HAS_COPYROW_AVX)
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { if (TestCpuFlag(kCpuHasAVX)) {
CopyRow = CopyRow_AVX; CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
} }
#endif #endif
#if defined(HAS_COPYROW_ERMS) #if defined(HAS_COPYROW_ERMS)
@ -150,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS; CopyRow = CopyRow_ERMS;
} }
#endif #endif
#if defined(HAS_COPYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS) #if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) { if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS; CopyRow = CopyRow_MIPS;

View File

@ -621,8 +621,6 @@ NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
#endif #endif
#undef NANY #undef NANY
#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \ #define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \ void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \ int n = width & ~MASK; \
@ -659,6 +657,27 @@ MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
#endif #endif
#undef MANY #undef MANY
// Generates a row-copy wrapper named NAMEANY that accepts any width.
//   NAMEANY   - name of the function to define.
//   COPY_SIMD - row function called for the leading part of the row.
//   COPY_C    - row function called for the trailing remainder.
//   BPP       - multiplier applied to n when advancing both pointers past the
//               part handled by COPY_SIMD.
//   MASK      - bit mask used to split width; the instantiations below pass
//               63 and 31, i.e. (block size - 1) for blocks of 64 and 32.
// The generated function computes n = width & ~MASK (width with the MASK bits
// cleared) and r = width & MASK (the MASK bits of width). COPY_SIMD is called
// only when n > 0; COPY_C is always called, with a count of r that may be 0.
#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
COPY_SIMD(src_y, dst_y, n); \
} \
COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
}
// Each wrapper is only defined when the matching HAS_COPYROW_* macro is set.
#ifdef HAS_COPYROW_AVX
MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
#endif
#ifdef HAS_COPYROW_SSE2
MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
#endif
#ifdef HAS_COPYROW_NEON
MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
#endif
#undef MANY
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@ -296,9 +296,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"pshufb %%xmm4,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n" "lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_rgb24), // %0 : "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -337,9 +337,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"pshufb %%xmm4,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n" "lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_raw), // %0 : "+r"(src_raw), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -725,9 +725,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
@ -765,9 +765,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
@ -837,10 +837,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n" "movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -910,10 +910,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n" "movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -961,7 +961,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"psraw $0x8,%%xmm2 \n" "psraw $0x8,%%xmm2 \n"
"packsswb %%xmm2,%%xmm0 \n" "packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
@ -980,6 +979,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"lea " MEMLEA(0x40,0) ",%0 \n" "lea " MEMLEA(0x40,0) ",%0 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -1038,10 +1038,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n" "movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -1080,9 +1080,9 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
@ -1145,10 +1145,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n" "movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_bgra0), // %0 : "+r"(src_bgra0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -1186,9 +1186,9 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
@ -1223,9 +1223,9 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n" "psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n" "packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
@ -1288,10 +1288,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n" "movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_abgr0), // %0 : "+r"(src_abgr0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -1357,10 +1357,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"psraw $0x8,%%xmm1 \n" "psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n" "packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n" "movlps %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_rgba0), // %0 : "+r"(src_rgba0), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -2186,9 +2186,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"1: \n" "1: \n"
MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
@ -2215,9 +2215,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpermq $0x4e,%%ymm0,%%ymm0 \n" "vpermq $0x4e,%%ymm0,%%ymm0 \n"
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src), // %0 : "+r"(src), // %0
@ -2249,9 +2249,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
"pshuflw $0x1b,%%xmm0,%%xmm0 \n" "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n" "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n" "pshufd $0x4e,%%xmm0,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1)",%1 \n" "lea " MEMLEA(0x10,1)",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
@ -2285,10 +2285,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n"
"pshufb %%xmm1,%%xmm0 \n" "pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0," MEMACCESS(1) " \n" "movlpd %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $8,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
@ -2322,9 +2322,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
@ -2346,13 +2346,13 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = {
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width); intptr_t temp_width = (intptr_t)(width);
asm volatile ( asm volatile (
"vmovdqa %3,%%ymm5 \n" "vmovdqu %3,%%ymm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src), // %0 : "+r"(src), // %0
@ -2574,21 +2574,6 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
} }
#endif // HAS_COPYROW_AVX #endif // HAS_COPYROW_AVX
#ifdef HAS_COPYROW_X86
// Copies a row from src to dst using the x86 string instruction "rep movsl".
// The count is shifted right by 2 before the move, so it is consumed in units
// of 4 and the low two bits of width are discarded: a width that is not a
// multiple of 4 leaves its trailing 1-3 bytes uncopied.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
// The count lives in a size_t local because the asm modifies it in place
// (read-write operand %2).
size_t width_tmp = (size_t)(width);
asm volatile (
// Divide the count by 4 to get the number of 4-byte moves.
"shr $0x2,%2 \n"
// Repeated 4-byte string move. MEMMOVESTRING is defined elsewhere;
// presumably it expands to the operand text for the target -- TODO confirm.
"rep movsl " MEMMOVESTRING(0,1) " \n"
// All three operands are read-write and pinned to specific registers by
// the GCC x86 constraints: S = source index, D = destination index,
// c = count register.
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
:
// "memory": the asm stores through dst; "cc": shr modifies the flags.
: "memory", "cc"
);
}
#endif // HAS_COPYROW_X86
#ifdef HAS_COPYROW_ERMS #ifdef HAS_COPYROW_ERMS
// Multiple of 1. // Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
@ -2894,9 +2879,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
@ -3006,9 +2991,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"vpand %%ymm5,%%ymm1,%%ymm1 \n" "vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_yuy2), // %0 : "+r"(src_yuy2), // %0
@ -3119,9 +3104,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
"vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"sub $0x20,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_uyvy), // %0 : "+r"(src_uyvy), // %0
@ -3263,9 +3248,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n" "movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n" "lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n" "jge 10b \n"
"19: \n" "19: \n"
@ -3295,9 +3280,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jge 41b \n" "jge 41b \n"
"49: \n" "49: \n"
@ -3326,9 +3311,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n" "movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n" "lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 91b \n" "jge 91b \n"
"99: \n" "99: \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
@ -3398,9 +3383,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n" "movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n" "lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n" "jge 10b \n"
"19: \n" "19: \n"
@ -3428,9 +3413,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jge 40b \n" "jge 40b \n"
"49: \n" "49: \n"
@ -3457,9 +3442,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n" "paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n" "movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n" "lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 91b \n" "jge 91b \n"
"99: \n" "99: \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
@ -3505,9 +3490,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n" "por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -3558,9 +3543,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n" "por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -3603,9 +3588,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
"vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpor %%ymm6,%%ymm0,%%ymm0 \n" "vpor %%ymm6,%%ymm0,%%ymm0 \n"
"sub $0x8,%2 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
@ -3651,9 +3636,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pmulhuw %%xmm2,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n" "lea " MEMLEA(0x10,0) ",%0 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -3723,9 +3708,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"sub $0x8,%2 \n"
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
@ -3776,10 +3761,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm3,%%xmm0 \n" "punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n" "punpckhwd %%xmm3,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -3853,10 +3838,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm5,%%xmm0 \n" "punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n" "punpckhwd %%xmm5,%%xmm1 \n"
"sub $0x8,%1 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n" "movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
@ -3919,11 +3904,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"movdqa %%xmm0,%%xmm6 \n" "movdqa %%xmm0,%%xmm6 \n"
"punpcklwd %%xmm1,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm6 \n" "punpckhwd %%xmm1,%%xmm6 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -3972,9 +3957,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"paddw %%xmm4,%%xmm1 \n" "paddw %%xmm4,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n" "por %%xmm7,%%xmm0 \n"
"sub $0x4,%1 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n" "movdqu %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n" "lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x4,%1 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(width) // %1 "+r"(width) // %1
@ -4011,9 +3996,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n" "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -4050,9 +4035,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n" "pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
@ -4119,9 +4104,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n" "movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
@ -4179,9 +4164,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n" "movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"psubusb %%xmm1,%%xmm0 \n" "psubusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb0), // %0 : "+r"(src_argb0), // %0
"+r"(src_argb1), // %1 "+r"(src_argb1), // %1
@ -4264,9 +4249,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n" "psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n" "pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n"
"sub $0x8,%4 \n"
MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
"lea " MEMLEA(0x8,0) ",%0 \n" "lea " MEMLEA(0x8,0) ",%0 \n"
"sub $0x8,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
@ -4322,9 +4307,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n" "psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n" "pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n"
"sub $0x8,%3 \n"
MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
"lea " MEMLEA(0x8,0) ",%0 \n" "lea " MEMLEA(0x8,0) ",%0 \n"
"sub $0x8,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_y0), // %0 : "+r"(src_y0), // %0
"+r"(src_y1), // %1 "+r"(src_y1), // %1
@ -4375,12 +4360,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpckhwd %%xmm0,%%xmm0 \n" "punpckhwd %%xmm0,%%xmm0 \n"
"por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm1," MEMACCESS(2) " \n" "movdqu %%xmm1," MEMACCESS(2) " \n"
"movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n" "lea " MEMLEA(0x40,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
@ -4414,9 +4399,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n" "lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n" "paddusb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
@ -4466,12 +4451,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"movdqa %%xmm1,%%xmm7 \n" "movdqa %%xmm1,%%xmm7 \n"
"punpcklwd %%xmm0,%%xmm7 \n" "punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n" "punpckhwd %%xmm0,%%xmm1 \n"
"sub $0x10,%3 \n"
"movdqu %%xmm6," MEMACCESS(2) " \n" "movdqu %%xmm6," MEMACCESS(2) " \n"
"movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n" "lea " MEMLEA(0x40,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_sobelx), // %0 : "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1 "+r"(src_sobely), // %1
@ -4757,9 +4742,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm0 \n" "punpckldq %%xmm6,%%xmm0 \n"
"addps %%xmm4,%%xmm3 \n" "addps %%xmm4,%%xmm3 \n"
"sub $0x4,%4 \n"
"movq %%xmm0," MEMACCESS2(0x08,2) " \n" "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%4 \n"
"jge 40b \n" "jge 40b \n"
"49: \n" "49: \n"
@ -4775,9 +4760,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"addps %%xmm7,%%xmm2 \n" "addps %%xmm7,%%xmm2 \n"
"movd %%xmm0,%k1 \n" "movd %%xmm0,%k1 \n"
MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
"sub $0x1,%4 \n"
"movd %%xmm0," MEMACCESS(2) " \n" "movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x04,2) ",%2 \n" "lea " MEMLEA(0x04,2) ",%2 \n"
"sub $0x1,%4 \n"
"jge 10b \n" "jge 10b \n"
"19: \n" "19: \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
@ -4836,9 +4821,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm1 \n" "psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
"jmp 99f \n" "jmp 99f \n"
@ -4849,9 +4834,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm1) MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 25b \n" "jg 25b \n"
"jmp 99f \n" "jmp 99f \n"
@ -4861,9 +4846,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"movdqu " MEMACCESS(1) ",%%xmm0 \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1) MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 50b \n" "jg 50b \n"
"jmp 99f \n" "jmp 99f \n"
@ -4874,9 +4859,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm0) MEMOPREG(movdqu,0x00,1,4,1,xmm0)
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 75b \n" "jg 75b \n"
"jmp 99f \n" "jmp 99f \n"
@ -4884,9 +4869,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
LABELALIGN LABELALIGN
"100: \n" "100: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 100b \n" "jg 100b \n"
"99: \n" "99: \n"
@ -4952,9 +4937,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"paddw %%xmm2,%%xmm0 \n" "paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n" "paddw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
"jmp 99f \n" "jmp 99f \n"
@ -4965,9 +4950,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 25b \n" "jg 25b \n"
"jmp 99f \n" "jmp 99f \n"
@ -4977,9 +4962,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"movdqu " MEMACCESS(1) ",%%xmm0 \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 50b \n" "jg 50b \n"
"jmp 99f \n" "jmp 99f \n"
@ -4990,9 +4975,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 75b \n" "jg 75b \n"
"jmp 99f \n" "jmp 99f \n"
@ -5000,9 +4985,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
LABELALIGN LABELALIGN
"100: \n" "100: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 100b \n" "jg 100b \n"
"99: \n" "99: \n"
@ -5037,9 +5022,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n"
"punpckldq %%xmm1,%%xmm0 \n" "punpckldq %%xmm1,%%xmm0 \n"
"sub $0x8,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n" "movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
@ -5070,9 +5055,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"pand %%xmm5,%%xmm1 \n" "pand %%xmm5,%%xmm1 \n"
"packssdw %%xmm1,%%xmm0 \n" "packssdw %%xmm1,%%xmm0 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x8,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n" "movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_bayer), // %1 "+r"(dst_bayer), // %1
@ -5099,10 +5084,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n" "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -5129,10 +5114,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x40,0) ",%0 \n" "lea " MEMLEA(0x40,0) ",%0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n" "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"sub $0x10,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n"
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n" "lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
@ -5196,9 +5181,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x1b,%%xmm1,%%xmm1 \n" "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
"pshuflw $0x1b,%%xmm1,%%xmm1 \n" "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 123b \n" "jg 123b \n"
"jmp 99f \n" "jmp 99f \n"
@ -5214,9 +5199,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x39,%%xmm1,%%xmm1 \n" "pshufhw $0x39,%%xmm1,%%xmm1 \n"
"pshuflw $0x39,%%xmm1,%%xmm1 \n" "pshuflw $0x39,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 321b \n" "jg 321b \n"
"jmp 99f \n" "jmp 99f \n"
@ -5232,9 +5217,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x93,%%xmm1,%%xmm1 \n" "pshufhw $0x93,%%xmm1,%%xmm1 \n"
"pshuflw $0x93,%%xmm1,%%xmm1 \n" "pshuflw $0x93,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 2103b \n" "jg 2103b \n"
"jmp 99f \n" "jmp 99f \n"
@ -5250,9 +5235,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0xc6,%%xmm1,%%xmm1 \n" "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
"pshuflw $0xc6,%%xmm1,%%xmm1 \n" "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%3 \n"
"jg 3012b \n" "jg 3012b \n"
"99: \n" "99: \n"
@ -5394,9 +5379,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
"cvttps2dq %%xmm4,%%xmm4 \n" "cvttps2dq %%xmm4,%%xmm4 \n"
"packuswb %%xmm4,%%xmm0 \n" "packuswb %%xmm4,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n"
"sub $0x2,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n" "movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x2,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -5435,9 +5420,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
"vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
"sub $0x2,%2 \n"
"vmovq %%xmm0," MEMACCESS(1) " \n" "vmovq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n" "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x2,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
@ -5597,9 +5582,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"mov %b0," MEMACCESS2(0xe,3) " \n" "mov %b0," MEMACCESS2(0xe,3) " \n"
"movzb " MEMACCESS2(0xf,2) ",%0 \n" "movzb " MEMACCESS2(0xf,2) ",%0 \n"
"mov %b0," MEMACCESS2(0xf,3) " \n" "mov %b0," MEMACCESS2(0xf,3) " \n"
"sub $0x4,%4 \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"lea " MEMLEA(0x10,3) ",%3 \n" "lea " MEMLEA(0x10,3) ",%3 \n"
"sub $0x4,%4 \n"
"jg 1b \n" "jg 1b \n"
: "+d"(pixel_temp), // %0 : "+d"(pixel_temp), // %0
"+a"(table_temp), // %1 "+a"(table_temp), // %1

View File

@ -313,9 +313,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
pshufb xmm3, xmm4 pshufb xmm3, xmm4
movdqu [edx + 16], xmm1 movdqu [edx + 16], xmm1
por xmm3, xmm5 por xmm3, xmm5
sub ecx, 16
movdqu [edx + 48], xmm3 movdqu [edx + 48], xmm3
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -353,9 +353,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
pshufb xmm3, xmm4 pshufb xmm3, xmm4
movdqu [edx + 16], xmm1 movdqu [edx + 16], xmm1
por xmm3, xmm5 por xmm3, xmm5
sub ecx, 16
movdqu [edx + 48], xmm3 movdqu [edx + 48], xmm3
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -728,9 +728,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm5 paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -764,9 +764,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm0, 7 psrlw xmm0, 7
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -782,7 +782,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToY vbroadcastf128 ymm4, kARGBToY
vbroadcastf128 ymm5, kAddY16 vbroadcastf128 ymm5, kAddY16
vmovdqa ymm6, kPermdARGBToY_AVX vmovdqu ymm6, kPermdARGBToY_AVX
align 4 align 4
convertloop: convertloop:
@ -802,9 +802,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpackuswb ymm0, ymm0, ymm2 // mutates. vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
vpaddb ymm0, ymm0, ymm5 vpaddb ymm0, ymm0, ymm5
sub ecx, 32
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32
jg convertloop jg convertloop
vzeroupper vzeroupper
ret ret
@ -822,7 +822,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */ mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToYJ vbroadcastf128 ymm4, kARGBToYJ
vbroadcastf128 ymm5, kAddYJ64 vbroadcastf128 ymm5, kAddYJ64
vmovdqa ymm6, kPermdARGBToY_AVX vmovdqu ymm6, kPermdARGBToY_AVX
align 4 align 4
convertloop: convertloop:
@ -843,9 +843,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpsrlw ymm2, ymm2, 7 vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 // mutates. vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
sub ecx, 32
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32
jg convertloop jg convertloop
vzeroupper vzeroupper
@ -880,9 +880,9 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm5 paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -914,9 +914,9 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm5 paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -948,9 +948,9 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7 psrlw xmm2, 7
packuswb xmm0, xmm2 packuswb xmm0, xmm2
paddb xmm0, xmm5 paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -1015,10 +1015,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16
jg convertloop jg convertloop
pop edi pop edi
@ -1087,10 +1087,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
packsswb xmm0, xmm1 packsswb xmm0, xmm1
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16
jg convertloop jg convertloop
pop edi pop edi
@ -1152,10 +1152,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vpaddb ymm0, ymm0, ymm5 // -> unsigned vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values // step 3 - store 16 U and 16 V values
sub ecx, 32
vextractf128 [edx], ymm0, 0 // U vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 32
jg convertloop jg convertloop
pop edi pop edi
@ -1197,7 +1197,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
psraw xmm2, 8 psraw xmm2, 8
packsswb xmm0, xmm2 packsswb xmm0, xmm2
paddb xmm0, xmm5 paddb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
movdqu xmm0, [eax] // V movdqu xmm0, [eax] // V
@ -1217,6 +1216,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
lea eax, [eax + 64] lea eax, [eax + 64]
movdqu [edx + edi], xmm0 movdqu [edx + edi], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
pop edi pop edi
@ -1272,10 +1272,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16
jg convertloop jg convertloop
pop edi pop edi
@ -1342,10 +1342,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16
jg convertloop jg convertloop
pop edi pop edi
@ -1413,10 +1413,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16
jg convertloop jg convertloop
pop edi pop edi
@ -1484,10 +1484,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values // step 3 - store 8 U and 8 V values
sub ecx, 16
movlps qword ptr [edx], xmm0 // U movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 16
jg convertloop jg convertloop
pop edi pop edi
@ -2043,9 +2043,9 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
por xmm3, xmm2 // BG por xmm3, xmm2 // BG
por xmm1, xmm3 // BGR por xmm1, xmm3 // BGR
packssdw xmm0, xmm1 packssdw xmm0, xmm1
sub ecx, 8
movdqu [edx], xmm0 // store 8 pixels of RGB565 movdqu [edx], xmm0 // store 8 pixels of RGB565
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 8
jg convertloop jg convertloop
pop edi pop edi
@ -2411,9 +2411,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
convertloop: convertloop:
movdqu xmm0, [eax - 16 + ecx] movdqu xmm0, [eax - 16 + ecx]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -2434,9 +2434,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu ymm0, [eax - 32 + ecx] vmovdqu ymm0, [eax - 32 + ecx]
vpshufb ymm0, ymm0, ymm5 vpshufb ymm0, ymm0, ymm5
vpermq ymm0, ymm0, 0x4e // swap high and low halfs vpermq ymm0, ymm0, 0x4e // swap high and low halfs
sub ecx, 32
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32
jg convertloop jg convertloop
vzeroupper vzeroupper
ret ret
@ -2462,9 +2462,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
pshuflw xmm0, xmm0, 0x1b // swap words pshuflw xmm0, xmm0, 0x1b // swap words
pshufhw xmm0, xmm0, 0x1b pshufhw xmm0, xmm0, 0x1b
pshufd xmm0, xmm0, 0x4e // swap qwords pshufd xmm0, xmm0, 0x4e // swap qwords
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -2495,10 +2495,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax - 16] lea eax, [eax - 16]
pshufb xmm0, xmm1 pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0 movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0 movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 8
jg convertloop jg convertloop
pop edi pop edi
@ -2527,9 +2527,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
movdqu xmm0, [eax] movdqu xmm0, [eax]
lea eax, [eax - 16] lea eax, [eax - 16]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg convertloop jg convertloop
ret ret
} }
@ -2548,14 +2548,14 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width mov ecx, [esp + 12] // width
vmovdqa ymm5, kARGBShuffleMirror_AVX2 vmovdqu ymm5, kARGBShuffleMirror_AVX2
align 4 align 4
convertloop: convertloop:
vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
sub ecx, 8
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8
jg convertloop jg convertloop
vzeroupper vzeroupper
ret ret
@ -2773,25 +2773,6 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
} }
} }
#ifdef HAS_COPYROW_X86
// Copies a row of bytes from src to dst, one 32-bit dword at a time, using
// "rep movsd".
//
// count is a byte count and is expected to be a multiple of 4: it is divided
// by 4 (shr ecx, 2) to obtain the dword count, so any trailing count % 4
// bytes are not copied.
//
// The function is naked (no compiler-generated prologue/epilogue), so the
// arguments are read directly off the stack relative to esp, and esi/edi are
// preserved by hand by parking them in eax/edx for the duration of the copy.
// NOTE(review): presumably relies on the direction flag being clear on entry
// so that movsd walks forward through memory - confirm against the calling
// convention in use.
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
__asm {
// Stash the caller's esi/edi in eax/edx; both are overwritten below because
// rep movsd implicitly uses esi (source) and edi (destination).
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
mov ecx, [esp + 12] // count
shr ecx, 2 // bytes -> dwords; remainder bytes are dropped
rep movsd // copy ecx dwords from [esi] to [edi]
// Put the caller's edi/esi back before returning.
mov edi, edx
mov esi, eax
ret
}
}
#endif // HAS_COPYROW_X86
#ifdef HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels // width in pixels
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
@ -2998,9 +2979,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpand ymm1, ymm1, ymm5 vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8 vpermq ymm0, ymm0, 0xd8
sub ecx, 32
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32
jg convertloop jg convertloop
vzeroupper vzeroupper
ret ret
@ -3109,9 +3090,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
vpsrlw ymm1, ymm1, 8 vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8 vpermq ymm0, ymm0, 0xd8
sub ecx, 32
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32
jg convertloop jg convertloop
vzeroupper vzeroupper
ret ret
@ -3223,9 +3204,9 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pand xmm0, xmm5 // even bytes are Y pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5 pand xmm1, xmm5
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -3328,9 +3309,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
psrlw xmm0, 8 // odd bytes are Y psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
ret ret
} }
@ -3466,9 +3447,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 1
jge alignloop1 jge alignloop1
alignloop1b: alignloop1b:
@ -3497,9 +3478,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jge convertloop4 jge convertloop4
convertloop4b: convertloop4b:
@ -3528,9 +3509,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 1
jge convertloop1 jge convertloop1
convertloop1b: convertloop1b:
@ -3598,9 +3579,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 1
jge alignloop1 jge alignloop1
alignloop1b: alignloop1b:
@ -3627,9 +3608,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jge convertloop4 jge convertloop4
convertloop4b: convertloop4b:
@ -3656,9 +3637,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb paddusb xmm0, xmm1 // + src argb
sub ecx, 1
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 1
jge convertloop1 jge convertloop1
convertloop1b: convertloop1b:
@ -3701,9 +3682,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
packuswb xmm0, xmm1 packuswb xmm0, xmm1
pand xmm0, xmm5 // keep original alphas pand xmm0, xmm5 // keep original alphas
por xmm0, xmm2 por xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg convertloop jg convertloop
ret ret
@ -3750,9 +3731,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
por xmm0, xmm2 // copy original alpha por xmm0, xmm2 // copy original alpha
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg convertloop jg convertloop
ret ret
@ -3790,9 +3771,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
vpsrlw ymm1, ymm1, 8 vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // unmutated. vpackuswb ymm0, ymm0, ymm1 // unmutated.
vpor ymm0, ymm0, ymm6 // copy original alpha vpor ymm0, ymm0, ymm6 // copy original alpha
sub ecx, 8
vmovdqu [eax + edx], ymm0 vmovdqu [eax + edx], ymm0
lea eax, [eax + 32] lea eax, [eax + 32]
sub ecx, 8
jg convertloop jg convertloop
vzeroupper vzeroupper
@ -3839,9 +3820,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 16] lea eax, [eax + 16]
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg convertloop jg convertloop
pop edi pop edi
pop esi pop esi
@ -3883,9 +3864,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated. vpackuswb ymm0, ymm0, ymm1 // unmutated.
sub ecx, 8
vmovdqu [eax + edx], ymm0 vmovdqu [eax + edx], ymm0
lea eax, [eax + 32] lea eax, [eax + 32]
sub ecx, 8
jg convertloop jg convertloop
vzeroupper vzeroupper
@ -3945,9 +3926,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated. vpackuswb ymm0, ymm0, ymm1 // unmutated.
sub ecx, 8
vmovdqu [eax + edx], ymm0 vmovdqu [eax + edx], ymm0
lea eax, [eax + 32] lea eax, [eax + 32]
sub ecx, 8
jg convertloop jg convertloop
pop edi pop edi
@ -3993,10 +3974,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm1, xmm0 movdqa xmm1, xmm0
punpcklwd xmm0, xmm3 // GGGA first 4 punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4 punpckhwd xmm1, xmm3 // GGGA next 4
sub ecx, 8
movdqu [edx], xmm0 movdqu [edx], xmm0
movdqu [edx + 16], xmm1 movdqu [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8
jg convertloop jg convertloop
ret ret
} }
@ -4064,10 +4045,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm1, xmm0 // Weave BG, RA together movdqa xmm1, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm5 // BGRA first 4 punpcklwd xmm0, xmm5 // BGRA first 4
punpckhwd xmm1, xmm5 // BGRA next 4 punpckhwd xmm1, xmm5 // BGRA next 4
sub ecx, 8
movdqu [eax], xmm0 movdqu [eax], xmm0
movdqu [eax + 16], xmm1 movdqu [eax + 16], xmm1
lea eax, [eax + 32] lea eax, [eax + 32]
sub ecx, 8
jg convertloop jg convertloop
ret ret
} }
@ -4128,11 +4109,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movdqa xmm6, xmm0 // Weave BG, RA together movdqa xmm6, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm1 // BGRA first 4 punpcklwd xmm0, xmm1 // BGRA first 4
punpckhwd xmm6, xmm1 // BGRA next 4 punpckhwd xmm6, xmm1 // BGRA next 4
sub ecx, 8
movdqu [edx], xmm0 movdqu [edx], xmm0
movdqu [edx + 16], xmm6 movdqu [edx + 16], xmm6
lea eax, [eax + 32] lea eax, [eax + 32]
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8
jg convertloop jg convertloop
ret ret
} }
@ -4176,9 +4157,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
paddw xmm1, xmm4 paddw xmm1, xmm4
packuswb xmm0, xmm1 packuswb xmm0, xmm1
por xmm0, xmm7 por xmm0, xmm7
sub ecx, 4
movdqu [eax], xmm0 movdqu [eax], xmm0
lea eax, [eax + 16] lea eax, [eax + 16]
sub ecx, 4
jg convertloop jg convertloop
ret ret
} }
@ -4210,9 +4191,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm0, 8 psrlw xmm0, 8
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg convertloop jg convertloop
ret ret
@ -4248,9 +4229,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
lea eax, [eax + 16] lea eax, [eax + 16]
lea esi, [esi + 16] lea esi, [esi + 16]
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg convertloop jg convertloop
pop esi pop esi
@ -4282,9 +4263,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movdqu xmm1, [esi] // read 4 pixels from src_argb1 movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16] lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb0 + src_argb1 paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jge convertloop4 jge convertloop4
convertloop49: convertloop49:
@ -4297,9 +4278,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movd xmm1, [esi] // read 1 pixels from src_argb1 movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4] lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb0 + src_argb1 paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 1
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 1
jge convertloop1 jge convertloop1
convertloop19: convertloop19:
@ -4328,9 +4309,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movdqu xmm1, [esi] // read 4 pixels from src_argb1 movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16] lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb0 - src_argb1 psubusb xmm0, xmm1 // src_argb0 - src_argb1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg convertloop jg convertloop
pop esi pop esi
@ -4482,9 +4463,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0 psubw xmm1, xmm0
pmaxsw xmm0, xmm1 pmaxsw xmm0, xmm1
packuswb xmm0, xmm0 packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [eax + edx], xmm0 movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8] lea eax, [eax + 8]
sub ecx, 8
jg convertloop jg convertloop
pop edi pop edi
@ -4536,9 +4517,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0 psubw xmm1, xmm0
pmaxsw xmm0, xmm1 pmaxsw xmm0, xmm1
packuswb xmm0, xmm0 packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [eax + edx], xmm0 movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8] lea eax, [eax + 8]
sub ecx, 8
jg convertloop jg convertloop
pop esi pop esi
@ -4585,12 +4566,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
punpckhwd xmm0, xmm0 // Last 4 punpckhwd xmm0, xmm0 // Last 4
por xmm3, xmm5 // GGGA por xmm3, xmm5 // GGGA
por xmm0, xmm5 por xmm0, xmm5
sub ecx, 16
movdqu [edx], xmm1 movdqu [edx], xmm1
movdqu [edx + 16], xmm2 movdqu [edx + 16], xmm2
movdqu [edx + 32], xmm3 movdqu [edx + 32], xmm3
movdqu [edx + 48], xmm0 movdqu [edx + 48], xmm0
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 16
jg convertloop jg convertloop
pop esi pop esi
@ -4618,9 +4599,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16] lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely paddusb xmm0, xmm1 // sobel = sobelx + sobely
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg convertloop jg convertloop
pop esi pop esi
@ -4666,12 +4647,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
movdqa xmm7, xmm1 // YSXA movdqa xmm7, xmm1 // YSXA
punpcklwd xmm7, xmm0 // Next 4 punpcklwd xmm7, xmm0 // Next 4
punpckhwd xmm1, xmm0 // Last 4 punpckhwd xmm1, xmm0 // Last 4
sub ecx, 16
movdqu [edx], xmm6 movdqu [edx], xmm6
movdqu [edx + 16], xmm4 movdqu [edx + 16], xmm4
movdqu [edx + 32], xmm7 movdqu [edx + 32], xmm7
movdqu [edx + 48], xmm1 movdqu [edx + 48], xmm1
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 16
jg convertloop jg convertloop
pop esi pop esi
@ -4983,9 +4964,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
movd xmm0, [eax + edi] // read pixel 3 movd xmm0, [eax + edi] // read pixel 3
punpckldq xmm6, xmm0 // combine pixel 2 and 3 punpckldq xmm6, xmm0 // combine pixel 2 and 3
addps xmm3, xmm4 // x, y += dx, dy next 2 addps xmm3, xmm4 // x, y += dx, dy next 2
sub ecx, 4
movq qword ptr 8[edx], xmm6 movq qword ptr 8[edx], xmm6
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jge l4 jge l4
l4b: l4b:
@ -5001,9 +4982,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
addps xmm2, xmm7 // x, y += dx, dy addps xmm2, xmm7 // x, y += dx, dy
movd esi, xmm0 movd esi, xmm0
movd xmm0, [eax + esi] // copy a pixel movd xmm0, [eax + esi] // copy a pixel
sub ecx, 1
movd [edx], xmm0 movd [edx], xmm0
lea edx, [edx + 4] lea edx, [edx + 4]
sub ecx, 1
jge l1 jge l1
l1b: l1b:
pop edi pop edi
@ -5059,9 +5040,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpsrlw ymm0, ymm0, 7 vpsrlw ymm0, ymm0, 7
vpsrlw ymm1, ymm1, 7 vpsrlw ymm1, ymm1, 7
vpackuswb ymm0, ymm0, ymm1 // unmutates vpackuswb ymm0, ymm0, ymm1 // unmutates
sub ecx, 32
vmovdqu [esi + edi], ymm0 vmovdqu [esi + edi], ymm0
lea esi, [esi + 32] lea esi, [esi + 32]
sub ecx, 32
jg xloop jg xloop
jmp xloop99 jmp xloop99
@ -5072,9 +5053,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vmovdqu ymm1, [esi + edx] vmovdqu ymm1, [esi + edx]
vpavgb ymm0, ymm0, ymm1 vpavgb ymm0, ymm0, ymm1
vpavgb ymm0, ymm0, ymm1 vpavgb ymm0, ymm0, ymm1
sub ecx, 32
vmovdqu [esi + edi], ymm0 vmovdqu [esi + edi], ymm0
lea esi, [esi + 32] lea esi, [esi + 32]
sub ecx, 32
jg xloop25 jg xloop25
jmp xloop99 jmp xloop99
@ -5083,9 +5064,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
xloop50: xloop50:
vmovdqu ymm0, [esi] vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx] vpavgb ymm0, ymm0, [esi + edx]
sub ecx, 32
vmovdqu [esi + edi], ymm0 vmovdqu [esi + edi], ymm0
lea esi, [esi + 32] lea esi, [esi + 32]
sub ecx, 32
jg xloop50 jg xloop50
jmp xloop99 jmp xloop99
@ -5096,9 +5077,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vmovdqu ymm0, [esi + edx] vmovdqu ymm0, [esi + edx]
vpavgb ymm0, ymm0, ymm1 vpavgb ymm0, ymm0, ymm1
vpavgb ymm0, ymm0, ymm1 vpavgb ymm0, ymm0, ymm1
sub ecx, 32
vmovdqu [esi + edi], ymm0 vmovdqu [esi + edi], ymm0
lea esi, [esi + 32] lea esi, [esi + 32]
sub ecx, 32
jg xloop75 jg xloop75
jmp xloop99 jmp xloop99
@ -5161,9 +5142,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
psrlw xmm0, 7 psrlw xmm0, 7
psrlw xmm1, 7 psrlw xmm1, 7
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop jg xloop
jmp xloop99 jmp xloop99
@ -5174,9 +5155,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
pavgb xmm0, xmm1 pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop25 jg xloop25
jmp xloop99 jmp xloop99
@ -5186,9 +5167,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop50 jg xloop50
jmp xloop99 jmp xloop99
@ -5199,9 +5180,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi + edx] movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
pavgb xmm0, xmm1 pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop75 jg xloop75
jmp xloop99 jmp xloop99
@ -5209,9 +5190,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
align 4 align 4
xloop100: xloop100:
movdqu xmm0, [esi] movdqu xmm0, [esi]
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop100 jg xloop100
xloop99: xloop99:
@ -5273,9 +5254,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
paddw xmm0, xmm2 // sum rows paddw xmm0, xmm2 // sum rows
paddw xmm1, xmm3 paddw xmm1, xmm3
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop jg xloop
jmp xloop99 jmp xloop99
@ -5286,9 +5267,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
pavgb xmm0, xmm1 pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop25 jg xloop25
jmp xloop99 jmp xloop99
@ -5298,9 +5279,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi] movdqu xmm0, [esi]
movdqu xmm1, [esi + edx] movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop50 jg xloop50
jmp xloop99 jmp xloop99
@ -5311,9 +5292,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqu xmm0, [esi + edx] movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1 pavgb xmm0, xmm1
pavgb xmm0, xmm1 pavgb xmm0, xmm1
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop75 jg xloop75
jmp xloop99 jmp xloop99
@ -5321,9 +5302,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
align 4 align 4
xloop100: xloop100:
movdqu xmm0, [esi] movdqu xmm0, [esi]
sub ecx, 16
movdqu [esi + edi], xmm0 movdqu [esi + edi], xmm0
lea esi, [esi + 16] lea esi, [esi + 16]
sub ecx, 16
jg xloop100 jg xloop100
xloop99: xloop99:
@ -5352,9 +5333,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
pshufb xmm0, xmm5 pshufb xmm0, xmm5
pshufb xmm1, xmm5 pshufb xmm1, xmm5
punpckldq xmm0, xmm1 punpckldq xmm0, xmm1
sub ecx, 8
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 8
jg wloop jg wloop
ret ret
} }
@ -5383,9 +5364,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
pand xmm1, xmm5 pand xmm1, xmm5
packssdw xmm0, xmm1 packssdw xmm0, xmm1
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 8
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 8
jg wloop jg wloop
ret ret
} }
@ -5409,10 +5390,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 32] lea eax, [eax + 32]
pshufb xmm0, xmm5 pshufb xmm0, xmm5
pshufb xmm1, xmm5 pshufb xmm1, xmm5
sub ecx, 8
movdqu [edx], xmm0 movdqu [edx], xmm0
movdqu [edx + 16], xmm1 movdqu [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8
jg wloop jg wloop
ret ret
} }
@ -5436,10 +5417,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 64] lea eax, [eax + 64]
vpshufb ymm0, ymm0, ymm5 vpshufb ymm0, ymm0, ymm5
vpshufb ymm1, ymm1, ymm5 vpshufb ymm1, ymm1, ymm5
sub ecx, 16
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm1 vmovdqu [edx + 32], ymm1
lea edx, [edx + 64] lea edx, [edx + 64]
sub ecx, 16
jg wloop jg wloop
vzeroupper vzeroupper
@ -5502,9 +5483,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 01Bh pshufhw xmm1, xmm1, 01Bh
pshuflw xmm1, xmm1, 01Bh pshuflw xmm1, xmm1, 01Bh
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg shuf_0123 jg shuf_0123
jmp shuf99 jmp shuf99
@ -5520,9 +5501,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 039h pshufhw xmm1, xmm1, 039h
pshuflw xmm1, xmm1, 039h pshuflw xmm1, xmm1, 039h
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg shuf_0321 jg shuf_0321
jmp shuf99 jmp shuf99
@ -5538,9 +5519,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 093h pshufhw xmm1, xmm1, 093h
pshuflw xmm1, xmm1, 093h pshuflw xmm1, xmm1, 093h
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg shuf_2103 jg shuf_2103
jmp shuf99 jmp shuf99
@ -5556,9 +5537,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 0C6h pshufhw xmm1, xmm1, 0C6h
pshuflw xmm1, xmm1, 0C6h pshuflw xmm1, xmm1, 0C6h
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg shuf_3012 jg shuf_3012
shuf99: shuf99:
@ -5700,9 +5681,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
cvttps2dq xmm4, xmm4 cvttps2dq xmm4, xmm4
packuswb xmm0, xmm4 packuswb xmm0, xmm4
packuswb xmm0, xmm0 packuswb xmm0, xmm0
sub ecx, 2
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 2
jg convertloop jg convertloop
pop esi pop esi
ret ret
@ -5740,9 +5721,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
sub ecx, 2
vmovq qword ptr [edx], xmm0 vmovq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 2
jg convertloop jg convertloop
vzeroupper vzeroupper
ret ret
@ -5905,9 +5886,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movzx edx, byte ptr [eax + 15] // copy alpha. movzx edx, byte ptr [eax + 15] // copy alpha.
mov byte ptr [edi + 15], dl mov byte ptr [edi + 15], dl
sub ecx, 4
lea eax, [eax + 16] lea eax, [eax + 16]
lea edi, [edi + 16] lea edi, [edi + 16]
sub ecx, 4
jg convertloop jg convertloop
pop edi pop edi

View File

@ -534,11 +534,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm0,%%xmm1 \n" "paddusw %%xmm0,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n"
"sub $0x6,%2 \n"
"movd %%xmm1," MEMACCESS(1) " \n" "movd %%xmm1," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm1 \n" "psrlq $0x10,%%xmm1 \n"
"movd %%xmm1," MEMACCESS2(0x2,1) " \n" "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n" "lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -602,11 +602,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm7,%%xmm6 \n" "paddusw %%xmm7,%%xmm6 \n"
"pmulhuw %%xmm4,%%xmm6 \n" "pmulhuw %%xmm4,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n"
"sub $0x6,%2 \n"
"movd %%xmm6," MEMACCESS(1) " \n" "movd %%xmm6," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm6 \n" "psrlq $0x10,%%xmm6 \n"
"movd %%xmm6," MEMACCESS2(0x2,1) " \n" "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n" "lea " MEMLEA(0x6,1) ",%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
@ -765,10 +765,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n" "punpckhbw %%xmm1,%%xmm1 \n"
"sub $0x20,%2 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n" "movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(dst_ptr), // %0 : "+r"(dst_ptr), // %0
@ -792,9 +792,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -820,9 +820,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -852,9 +852,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n" "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
@ -890,9 +890,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n" "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"punpckldq %%xmm3,%%xmm2 \n" "punpckldq %%xmm3,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm0 \n" "punpcklqdq %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1 "+r"(src_stepx_x4), // %1
@ -941,9 +941,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm2,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1 "+r"(src_stepx_x4), // %1
@ -997,9 +997,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"pextrw $0x3,%%xmm2,%k1 \n" "pextrw $0x3,%%xmm2,%k1 \n"
"punpckldq %%xmm4,%%xmm1 \n" "punpckldq %%xmm4,%%xmm1 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" "punpcklqdq %%xmm1,%%xmm0 \n"
"sub $0x4,%4 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n" "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%4 \n"
"jge 40b \n" "jge 40b \n"
"49: \n" "49: \n"
@ -1046,10 +1046,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm1 \n"
"punpckldq %%xmm0,%%xmm0 \n" "punpckldq %%xmm0,%%xmm0 \n"
"punpckhdq %%xmm1,%%xmm1 \n" "punpckhdq %%xmm1,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(0) " \n" "movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0

View File

@ -111,9 +111,9 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrlw xmm0, 8 // isolate odd pixels. psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8 psrlw xmm1, 8
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg wloop jg wloop
ret ret
@ -149,9 +149,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm1, xmm3 pavgw xmm1, xmm3
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg wloop jg wloop
ret ret
@ -192,9 +192,9 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm1, xmm3 pavgw xmm1, xmm3
packuswb xmm0, xmm1 packuswb xmm0, xmm1
sub ecx, 16
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 16
jg wloop jg wloop
pop esi pop esi
@ -226,9 +226,9 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
packuswb xmm0, xmm1 packuswb xmm0, xmm1
psrlw xmm0, 8 psrlw xmm0, 8
packuswb xmm0, xmm0 packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 8
jg wloop jg wloop
ret ret
@ -285,9 +285,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm0, xmm2 pavgw xmm0, xmm2
packuswb xmm0, xmm0 packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [edx], xmm0 movq qword ptr [edx], xmm0
lea edx, [edx + 8] lea edx, [edx + 8]
sub ecx, 8
jg wloop jg wloop
pop edi pop edi
@ -398,9 +398,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7 paddsw xmm0, xmm7
psrlw xmm0, 2 psrlw xmm0, 2
packuswb xmm0, xmm0 packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0 movq qword ptr [edx + 16], xmm0
lea edx, [edx + 24] lea edx, [edx + 24]
sub ecx, 24
jg wloop jg wloop
pop esi pop esi
@ -460,9 +460,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7 paddsw xmm0, xmm7
psrlw xmm0, 2 psrlw xmm0, 2
packuswb xmm0, xmm0 packuswb xmm0, xmm0
sub ecx, 24
movq qword ptr [edx + 16], xmm0 movq qword ptr [edx + 16], xmm0
lea edx, [edx+24] lea edx, [edx+24]
sub ecx, 24
jg wloop jg wloop
pop esi pop esi
@ -493,11 +493,11 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
pshufb xmm1, xmm5 pshufb xmm1, xmm5
paddusb xmm0, xmm1 paddusb xmm0, xmm1
sub ecx, 12
movq qword ptr [edx], xmm0 // write 12 pixels movq qword ptr [edx], xmm0 // write 12 pixels
movhlps xmm1, xmm0 movhlps xmm1, xmm0
movd [edx + 8], xmm1 movd [edx + 8], xmm1
lea edx, [edx + 12] lea edx, [edx + 12]
sub ecx, 12
jg xloop jg xloop
ret ret
@ -558,11 +558,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
packuswb xmm6, xmm6 packuswb xmm6, xmm6
sub ecx, 6
movd [edx], xmm6 // write 6 pixels movd [edx], xmm6 // write 6 pixels
psrlq xmm6, 16 psrlq xmm6, 16
movd [edx + 2], xmm6 movd [edx + 2], xmm6
lea edx, [edx + 6] lea edx, [edx + 6]
sub ecx, 6
jg xloop jg xloop
pop esi pop esi
@ -604,11 +604,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
packuswb xmm1, xmm1 packuswb xmm1, xmm1
sub ecx, 6
movd [edx], xmm1 // write 6 pixels movd [edx], xmm1 // write 6 pixels
psrlq xmm1, 16 psrlq xmm1, 16
movd [edx + 2], xmm1 movd [edx + 2], xmm1
lea edx, [edx + 6] lea edx, [edx + 6]
sub ecx, 6
jg xloop jg xloop
pop esi pop esi
@ -784,10 +784,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
movdqa xmm1, xmm0 movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1 punpckhbw xmm1, xmm1
sub ecx, 32
movdqu [edx], xmm0 movdqu [edx], xmm0
movdqu [edx + 16], xmm1 movdqu [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 32
jg wloop jg wloop
ret ret
@ -812,9 +812,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
movdqu xmm1, [eax + 16] movdqu xmm1, [eax + 16]
lea eax, [eax + 32] lea eax, [eax + 32]
shufps xmm0, xmm1, 0xdd shufps xmm0, xmm1, 0xdd
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg wloop jg wloop
ret ret
@ -842,9 +842,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2 pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg wloop jg wloop
ret ret
@ -877,9 +877,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2 pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg wloop jg wloop
pop esi pop esi
@ -914,9 +914,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea eax, [eax + ebx * 4] lea eax, [eax + ebx * 4]
punpckldq xmm2, xmm3 punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2 punpcklqdq xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg wloop jg wloop
pop edi pop edi
@ -963,9 +963,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2 pavgb xmm0, xmm2
sub ecx, 4
movdqu [edx], xmm0 movdqu [edx], xmm0
lea edx, [edx + 16] lea edx, [edx + 16]
sub ecx, 4
jg wloop jg wloop
pop edi pop edi
@ -1021,9 +1021,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. next iteration. pextrw edx, xmm2, 3 // get x1 integer. next iteration.
punpckldq xmm1, xmm4 // x2 x3 punpckldq xmm1, xmm4 // x2 x3
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
sub ecx, 4 // 4 pixels
movdqu [edi], xmm0 movdqu [edi], xmm0
lea edi, [edi + 16] lea edi, [edi + 16]
sub ecx, 4 // 4 pixels
jge xloop4 jge xloop4
align 4 align 4
@ -1160,10 +1160,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
movdqa xmm1, xmm0 movdqa xmm1, xmm0
punpckldq xmm0, xmm0 punpckldq xmm0, xmm0
punpckhdq xmm1, xmm1 punpckhdq xmm1, xmm1
sub ecx, 8
movdqu [edx], xmm0 movdqu [edx], xmm0
movdqu [edx + 16], xmm1 movdqu [edx + 16], xmm1
lea edx, [edx + 32] lea edx, [edx + 32]
sub ecx, 8
jg wloop jg wloop
ret ret