mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
Move sub before branch for loops.
Remove CopyRow_X86. Add CopyRow_Any versions for AVX, SSE2 and NEON. BUG=269 TESTED=local build R=harryjin@google.com, tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/26209004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1175 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
813bf9f97d
commit
91f240c5db
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1174
|
||||
Version: 1175
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -111,7 +111,6 @@ extern "C" {
|
||||
#define HAS_BGRATOYROW_SSSE3
|
||||
#define HAS_COPYROW_ERMS
|
||||
#define HAS_COPYROW_SSE2
|
||||
#define HAS_COPYROW_X86
|
||||
#define HAS_I400TOARGBROW_SSE2
|
||||
#define HAS_I411TOARGBROW_SSSE3
|
||||
#define HAS_I422TOARGB1555ROW_SSSE3
|
||||
@ -877,10 +876,12 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_AVX(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_X86(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_C(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
|
||||
void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
|
||||
|
||||
void CopyRow_16_C(const uint16* src, uint16* dst, int count);
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1174
|
||||
#define LIBYUV_VERSION 1175
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -29,7 +29,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
"lea " MEMLEA(0x10, 0) ",%0 \n"
|
||||
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
|
||||
"lea " MEMLEA(0x10, 1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"psubusb %%xmm2,%%xmm1 \n"
|
||||
"psubusb %%xmm3,%%xmm2 \n"
|
||||
@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
"pmaddwd %%xmm2,%%xmm2 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"paddd %%xmm2,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"pshufd $0xee,%%xmm0,%%xmm1 \n"
|
||||
@ -124,13 +124,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
"pmulld %%xmm5,%%xmm1 \n"
|
||||
"paddd %%xmm4,%%xmm3 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"sub $0x10,%1 \n"
|
||||
"paddd %%xmm3,%%xmm1 \n"
|
||||
"pshufd $0xe,%%xmm1,%%xmm2 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"pshufd $0x1,%%xmm1,%%xmm2 \n"
|
||||
"paddd %%xmm2,%%xmm1 \n"
|
||||
"paddd %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%1 \n"
|
||||
"jg 1b \n"
|
||||
"movd %%xmm0,%3 \n"
|
||||
: "+r"(src), // %0
|
||||
|
||||
@ -33,7 +33,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
lea eax, [eax + 16]
|
||||
movdqu xmm2, [edx]
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
movdqa xmm3, xmm1 // abs trick
|
||||
psubusb xmm1, xmm2
|
||||
psubusb xmm2, xmm3
|
||||
@ -45,6 +44,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
pmaddwd xmm2, xmm2
|
||||
paddd xmm0, xmm1
|
||||
paddd xmm0, xmm2
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
pshufd xmm1, xmm0, 0xee
|
||||
@ -75,7 +75,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
vmovdqu ymm1, [eax]
|
||||
vmovdqu ymm2, [eax + edx]
|
||||
lea eax, [eax + 32]
|
||||
sub ecx, 32
|
||||
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
|
||||
vpsubusb ymm2, ymm2, ymm1
|
||||
vpor ymm1, ymm2, ymm3
|
||||
@ -85,6 +84,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
|
||||
vpmaddwd ymm1, ymm1, ymm1
|
||||
vpaddd ymm0, ymm0, ymm1
|
||||
vpaddd ymm0, ymm0, ymm2
|
||||
sub ecx, 32
|
||||
jg wloop
|
||||
|
||||
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
|
||||
@ -170,7 +170,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
pmulld(0xcd) // pmulld xmm1, xmm5
|
||||
paddd xmm3, xmm4 // add 16 results
|
||||
paddd xmm1, xmm2
|
||||
sub ecx, 16
|
||||
paddd xmm1, xmm3
|
||||
|
||||
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
@ -178,6 +177,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
|
||||
pshufd xmm2, xmm1, 0x01
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm0, xmm1
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
movd eax, xmm0 // return hash
|
||||
@ -209,13 +209,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
|
||||
pmulld xmm1, kHashMul3
|
||||
paddd xmm3, xmm4 // add 16 results
|
||||
paddd xmm1, xmm2
|
||||
sub ecx, 16
|
||||
paddd xmm1, xmm3
|
||||
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
|
||||
paddd xmm1, xmm2
|
||||
pshufd xmm2, xmm1, 0x01
|
||||
paddd xmm1, xmm2
|
||||
paddd xmm0, xmm1
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
movd eax, xmm0 // return hash
|
||||
|
||||
@ -188,19 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
|
||||
int width, int height) {
|
||||
int y;
|
||||
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
|
||||
#if defined(HAS_COPYROW_X86)
|
||||
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
|
||||
CopyRow = CopyRow_X86;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
|
||||
CopyRow = CopyRow_AVX;
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
@ -209,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_MIPS)
|
||||
@ -419,24 +414,14 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
|
||||
dst_stride_v = -dst_stride_v;
|
||||
}
|
||||
// CopyRow for rows of just Y in Q420 copied to Y plane of I420.
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_X86)
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
CopyRow = CopyRow_X86;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
|
||||
CopyRow = CopyRow_AVX;
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
@ -444,12 +429,16 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
|
||||
CopyRow = CopyRow_ERMS;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_MIPS)
|
||||
if (TestCpuFlag(kCpuHasMIPS)) {
|
||||
CopyRow = CopyRow_MIPS;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_YUY2TOYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
|
||||
|
||||
@ -41,19 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
|
||||
if (src_y == dst_y && src_stride_y == dst_stride_y) {
|
||||
return;
|
||||
}
|
||||
#if defined(HAS_COPYROW_X86)
|
||||
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
|
||||
CopyRow = CopyRow_X86;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
|
||||
CopyRow = CopyRow_AVX;
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
@ -62,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_NEON;
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_MIPS)
|
||||
@ -93,11 +88,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_y = 0;
|
||||
}
|
||||
#if defined(HAS_COPYROW_16_X86)
|
||||
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
|
||||
CopyRow = CopyRow_16_X86;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_16_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_16_SSE2;
|
||||
|
||||
@ -918,24 +918,14 @@ void RotatePlane180(const uint8* src, int src_stride,
|
||||
MirrorRow = MirrorRow_MIPS_DSPR2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_X86)
|
||||
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
|
||||
CopyRow = CopyRow_X86;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
|
||||
CopyRow = CopyRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
|
||||
CopyRow = CopyRow_AVX;
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
@ -943,6 +933,11 @@ void RotatePlane180(const uint8* src, int src_stride,
|
||||
CopyRow = CopyRow_ERMS;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_MIPS)
|
||||
if (TestCpuFlag(kCpuHasMIPS)) {
|
||||
CopyRow = CopyRow_MIPS;
|
||||
|
||||
@ -125,24 +125,14 @@ void ARGBRotate180(const uint8* src, int src_stride,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
|
||||
CopyRow = CopyRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_X86)
|
||||
if (TestCpuFlag(kCpuHasX86)) {
|
||||
CopyRow = CopyRow_X86;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) {
|
||||
CopyRow = CopyRow_SSE2;
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) {
|
||||
CopyRow = CopyRow_AVX;
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_ERMS)
|
||||
@ -150,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
|
||||
CopyRow = CopyRow_ERMS;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_COPYROW_MIPS)
|
||||
if (TestCpuFlag(kCpuHasMIPS)) {
|
||||
CopyRow = CopyRow_MIPS;
|
||||
|
||||
@ -621,8 +621,6 @@ NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
|
||||
#endif
|
||||
#undef NANY
|
||||
|
||||
|
||||
|
||||
#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
|
||||
int n = width & ~MASK; \
|
||||
@ -659,6 +657,27 @@ MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
|
||||
#endif
|
||||
#undef MANY
|
||||
|
||||
// Generates a row-copy wrapper NAMEANY(src_y, dst_y, width) that accepts any
// width by splitting the row into two parts:
//   n = width & ~MASK  - the leading part, a multiple of (MASK + 1), handed to
//                        COPY_SIMD (skipped when n is 0);
//   r = width & MASK   - the remaining tail, handed to COPY_C starting at
//                        offset n * BPP in both source and destination.
// BPP is the number of bytes per element used to advance the pointers past the
// SIMD part. COPY_C is always invoked, including with r == 0.
#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
|
||||
int n = width & ~MASK; \
|
||||
int r = width & MASK; \
|
||||
if (n > 0) { \
|
||||
COPY_SIMD(src_y, dst_y, n); \
|
||||
} \
|
||||
COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
|
||||
}
|
||||
|
||||
#ifdef HAS_COPYROW_AVX
|
||||
MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_COPYROW_SSE2
|
||||
MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_COPYROW_NEON
|
||||
MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
|
||||
#endif
|
||||
#undef MANY
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -296,9 +296,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
||||
"pshufb %%xmm4,%%xmm3 \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
|
||||
"por %%xmm5,%%xmm3 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
|
||||
"lea " MEMLEA(0x40,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_rgb24), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -337,9 +337,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
"pshufb %%xmm4,%%xmm3 \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
|
||||
"por %%xmm5,%%xmm3 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
|
||||
"lea " MEMLEA(0x40,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -725,9 +725,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
"psrlw $0x7,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -765,9 +765,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
"psrlw $0x7,%%xmm0 \n"
|
||||
"psrlw $0x7,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -837,10 +837,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movlps %%xmm0," MEMACCESS(1) " \n"
|
||||
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -910,10 +910,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
"psraw $0x8,%%xmm0 \n"
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movlps %%xmm0," MEMACCESS(1) " \n"
|
||||
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -961,7 +961,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"psraw $0x8,%%xmm2 \n"
|
||||
"packsswb %%xmm2,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||
@ -980,6 +979,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"lea " MEMLEA(0x40,0) ",%0 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -1038,10 +1038,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movlps %%xmm0," MEMACCESS(1) " \n"
|
||||
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -1080,9 +1080,9 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
|
||||
"psrlw $0x7,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_bgra), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1145,10 +1145,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movlps %%xmm0," MEMACCESS(1) " \n"
|
||||
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_bgra0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -1186,9 +1186,9 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
|
||||
"psrlw $0x7,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1223,9 +1223,9 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
|
||||
"psrlw $0x7,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1288,10 +1288,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movlps %%xmm0," MEMACCESS(1) " \n"
|
||||
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_abgr0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -1357,10 +1357,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movlps %%xmm0," MEMACCESS(1) " \n"
|
||||
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_rgba0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -2186,9 +2186,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
||||
"1: \n"
|
||||
MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -2215,9 +2215,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
||||
MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
|
||||
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0x4e,%%ymm0,%%ymm0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
@ -2249,9 +2249,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
|
||||
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
|
||||
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1)",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -2285,10 +2285,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"lea " MEMLEA(-0x10,0) ",%0 \n"
|
||||
"pshufb %%xmm1,%%xmm0 \n"
|
||||
"sub $8,%3 \n"
|
||||
"movlpd %%xmm0," MEMACCESS(1) " \n"
|
||||
MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $8,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -2322,9 +2322,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"lea " MEMLEA(-0x10,0) ",%0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -2346,13 +2346,13 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = {
|
||||
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
||||
intptr_t temp_width = (intptr_t)(width);
|
||||
asm volatile (
|
||||
"vmovdqa %3,%%ymm5 \n"
|
||||
"vmovdqu %3,%%ymm5 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
|
||||
"sub $0x20,%2 \n"
|
||||
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
@ -2574,21 +2574,6 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
|
||||
}
|
||||
#endif // HAS_COPYROW_AVX
|
||||
|
||||
#ifdef HAS_COPYROW_X86
|
||||
// Copies a row with the x86 string instruction `rep movsl`, which moves 4
// bytes per iteration.
// The count is first shifted right by 2 (bytes -> dwords), so the low two bits
// of width are discarded: any width % 4 tail bytes are not copied here.
// Operands are pinned by constraint: "S" = source index register (src),
// "D" = destination index register (dst), "c" = count register (width_tmp).
// All three are read-write ("+") because the asm modifies them.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
|
||||
size_t width_tmp = (size_t)(width);
|
||||
asm volatile (
|
||||
"shr $0x2,%2 \n"
|
||||
"rep movsl " MEMMOVESTRING(0,1) " \n"
|
||||
: "+S"(src), // %0
|
||||
"+D"(dst), // %1
|
||||
"+c"(width_tmp) // %2
|
||||
:
|
||||
: "memory", "cc"
|
||||
);
|
||||
}
|
||||
#endif // HAS_COPYROW_X86
|
||||
|
||||
#ifdef HAS_COPYROW_ERMS
|
||||
// Multiple of 1.
|
||||
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
|
||||
@ -2894,9 +2879,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
|
||||
"psrlw $0x8,%%xmm0 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -3006,9 +2991,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
||||
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
|
||||
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
@ -3119,9 +3104,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
|
||||
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
|
||||
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
@ -3263,9 +3248,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"movd %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x4,2) ",%2 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 10b \n"
|
||||
|
||||
"19: \n"
|
||||
@ -3295,9 +3280,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jge 41b \n"
|
||||
|
||||
"49: \n"
|
||||
@ -3326,9 +3311,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"movd %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x4,2) ",%2 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 91b \n"
|
||||
"99: \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
@ -3398,9 +3383,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"movd %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x4,2) ",%2 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 10b \n"
|
||||
|
||||
"19: \n"
|
||||
@ -3428,9 +3413,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jge 40b \n"
|
||||
|
||||
"49: \n"
|
||||
@ -3457,9 +3442,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"paddusb %%xmm2,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"movd %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x4,2) ",%2 \n"
|
||||
"sub $0x1,%3 \n"
|
||||
"jge 91b \n"
|
||||
"99: \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
@ -3505,9 +3490,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -3558,9 +3543,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm2,%%xmm0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -3603,9 +3588,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
|
||||
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vpor %%ymm6,%%ymm0,%%ymm0 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
@ -3651,9 +3636,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
"pmulhuw %%xmm2,%%xmm1 \n"
|
||||
"lea " MEMLEA(0x10,0) ",%0 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -3723,9 +3708,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
|
||||
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
@ -3776,10 +3761,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklwd %%xmm3,%%xmm0 \n"
|
||||
"punpckhwd %%xmm3,%%xmm1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -3853,10 +3838,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklwd %%xmm5,%%xmm0 \n"
|
||||
"punpckhwd %%xmm5,%%xmm1 \n"
|
||||
"sub $0x8,%1 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(0) " \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"sub $0x8,%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(width) // %1
|
||||
@ -3919,11 +3904,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
"movdqa %%xmm0,%%xmm6 \n"
|
||||
"punpcklwd %%xmm1,%%xmm0 \n"
|
||||
"punpckhwd %%xmm1,%%xmm6 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -3972,9 +3957,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
"paddw %%xmm4,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"por %%xmm7,%%xmm0 \n"
|
||||
"sub $0x4,%1 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(0) " \n"
|
||||
"lea " MEMLEA(0x10,0) ",%0 \n"
|
||||
"sub $0x4,%1 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(width) // %1
|
||||
@ -4011,9 +3996,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
"psrlw $0x8,%%xmm0 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -4050,9 +4035,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"pmulhuw %%xmm2,%%xmm0 \n"
|
||||
"pmulhuw %%xmm3,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
@ -4119,9 +4104,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
@ -4179,9 +4164,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"psubusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
@ -4264,9 +4249,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
||||
"psubw %%xmm0,%%xmm1 \n"
|
||||
"pmaxsw %%xmm1,%%xmm0 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"sub $0x8,%4 \n"
|
||||
MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
|
||||
"lea " MEMLEA(0x8,0) ",%0 \n"
|
||||
"sub $0x8,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_y0), // %0
|
||||
"+r"(src_y1), // %1
|
||||
@ -4322,9 +4307,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
||||
"psubw %%xmm0,%%xmm1 \n"
|
||||
"pmaxsw %%xmm1,%%xmm0 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"sub $0x8,%3 \n"
|
||||
MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
|
||||
"lea " MEMLEA(0x8,0) ",%0 \n"
|
||||
"sub $0x8,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_y0), // %0
|
||||
"+r"(src_y1), // %1
|
||||
@ -4375,12 +4360,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"punpckhwd %%xmm0,%%xmm0 \n"
|
||||
"por %%xmm5,%%xmm3 \n"
|
||||
"por %%xmm5,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movdqu %%xmm1," MEMACCESS(2) " \n"
|
||||
"movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
|
||||
"movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
|
||||
"movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
|
||||
"lea " MEMLEA(0x40,2) ",%2 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
@ -4414,9 +4399,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
|
||||
"lea " MEMLEA(0x10,0) ",%0 \n"
|
||||
"paddusb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
@ -4466,12 +4451,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"movdqa %%xmm1,%%xmm7 \n"
|
||||
"punpcklwd %%xmm0,%%xmm7 \n"
|
||||
"punpckhwd %%xmm0,%%xmm1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"movdqu %%xmm6," MEMACCESS(2) " \n"
|
||||
"movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
|
||||
"movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
|
||||
"lea " MEMLEA(0x40,2) ",%2 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
@ -4757,9 +4742,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
||||
MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
|
||||
"punpckldq %%xmm6,%%xmm0 \n"
|
||||
"addps %%xmm4,%%xmm3 \n"
|
||||
"sub $0x4,%4 \n"
|
||||
"movq %%xmm0," MEMACCESS2(0x08,2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%4 \n"
|
||||
"jge 40b \n"
|
||||
|
||||
"49: \n"
|
||||
@ -4775,9 +4760,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
||||
"addps %%xmm7,%%xmm2 \n"
|
||||
"movd %%xmm0,%k1 \n"
|
||||
MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
|
||||
"sub $0x1,%4 \n"
|
||||
"movd %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x04,2) ",%2 \n"
|
||||
"sub $0x1,%4 \n"
|
||||
"jge 10b \n"
|
||||
"19: \n"
|
||||
: "+r"(src_argb), // %0
|
||||
@ -4836,9 +4821,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
"psrlw $0x7,%%xmm0 \n"
|
||||
"psrlw $0x7,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -4849,9 +4834,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
MEMOPREG(movdqu,0x00,1,4,1,xmm1)
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 25b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -4861,9 +4846,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
|
||||
MEMOPREG(movdqu,0x00,1,4,1,xmm1)
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 50b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -4874,9 +4859,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
MEMOPREG(movdqu,0x00,1,4,1,xmm0)
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 75b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -4884,9 +4869,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
LABELALIGN
|
||||
"100: \n"
|
||||
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 100b \n"
|
||||
|
||||
"99: \n"
|
||||
@ -4952,9 +4937,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
"paddw %%xmm2,%%xmm0 \n"
|
||||
"paddw %%xmm3,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -4965,9 +4950,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 25b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -4977,9 +4962,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
|
||||
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 50b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -4990,9 +4975,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"pavgb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 75b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -5000,9 +4985,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
LABELALIGN
|
||||
"100: \n"
|
||||
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 100b \n"
|
||||
|
||||
"99: \n"
|
||||
@ -5037,9 +5022,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"pshufb %%xmm5,%%xmm1 \n"
|
||||
"punpckldq %%xmm1,%%xmm0 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movq %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_bayer), // %1
|
||||
@ -5070,9 +5055,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"packssdw %%xmm1,%%xmm0 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movq %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_bayer), // %1
|
||||
@ -5099,10 +5084,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"pshufb %%xmm5,%%xmm1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -5129,10 +5114,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||
"lea " MEMLEA(0x40,0) ",%0 \n"
|
||||
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
|
||||
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
|
||||
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
|
||||
"lea " MEMLEA(0x40,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
@ -5196,9 +5181,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
"pshufhw $0x1b,%%xmm1,%%xmm1 \n"
|
||||
"pshuflw $0x1b,%%xmm1,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 123b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -5214,9 +5199,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
"pshufhw $0x39,%%xmm1,%%xmm1 \n"
|
||||
"pshuflw $0x39,%%xmm1,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 321b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -5232,9 +5217,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
"pshufhw $0x93,%%xmm1,%%xmm1 \n"
|
||||
"pshuflw $0x93,%%xmm1,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 2103b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
@ -5250,9 +5235,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
"pshufhw $0xc6,%%xmm1,%%xmm1 \n"
|
||||
"pshuflw $0xc6,%%xmm1,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 3012b \n"
|
||||
|
||||
"99: \n"
|
||||
@ -5394,9 +5379,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
||||
"cvttps2dq %%xmm4,%%xmm4 \n"
|
||||
"packuswb %%xmm4,%%xmm0 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"sub $0x2,%2 \n"
|
||||
"movq %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x2,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -5435,9 +5420,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
"vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
"vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
|
||||
"sub $0x2,%2 \n"
|
||||
"vmovq %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x8,1) ",%1 \n"
|
||||
"sub $0x2,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
@ -5597,9 +5582,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
"mov %b0," MEMACCESS2(0xe,3) " \n"
|
||||
"movzb " MEMACCESS2(0xf,2) ",%0 \n"
|
||||
"mov %b0," MEMACCESS2(0xf,3) " \n"
|
||||
"sub $0x4,%4 \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"lea " MEMLEA(0x10,3) ",%3 \n"
|
||||
"sub $0x4,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+d"(pixel_temp), // %0
|
||||
"+a"(table_temp), // %1
|
||||
|
||||
@ -313,9 +313,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
||||
pshufb xmm3, xmm4
|
||||
movdqu [edx + 16], xmm1
|
||||
por xmm3, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx + 48], xmm3
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -353,9 +353,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
|
||||
pshufb xmm3, xmm4
|
||||
movdqu [edx + 16], xmm1
|
||||
por xmm3, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx + 48], xmm3
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -728,9 +728,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -764,9 +764,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
psrlw xmm0, 7
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -782,7 +782,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
vbroadcastf128 ymm4, kARGBToY
|
||||
vbroadcastf128 ymm5, kAddY16
|
||||
vmovdqa ymm6, kPermdARGBToY_AVX
|
||||
vmovdqu ymm6, kPermdARGBToY_AVX
|
||||
|
||||
align 4
|
||||
convertloop:
|
||||
@ -802,9 +802,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
vpackuswb ymm0, ymm0, ymm2 // mutates.
|
||||
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
|
||||
vpaddb ymm0, ymm0, ymm5
|
||||
sub ecx, 32
|
||||
vmovdqu [edx], ymm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 32
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
@ -822,7 +822,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
vbroadcastf128 ymm4, kARGBToYJ
|
||||
vbroadcastf128 ymm5, kAddYJ64
|
||||
vmovdqa ymm6, kPermdARGBToY_AVX
|
||||
vmovdqu ymm6, kPermdARGBToY_AVX
|
||||
|
||||
align 4
|
||||
convertloop:
|
||||
@ -843,9 +843,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
vpsrlw ymm2, ymm2, 7
|
||||
vpackuswb ymm0, ymm0, ymm2 // mutates.
|
||||
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
|
||||
sub ecx, 32
|
||||
vmovdqu [edx], ymm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 32
|
||||
jg convertloop
|
||||
|
||||
vzeroupper
|
||||
@ -880,9 +880,9 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -914,9 +914,9 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -948,9 +948,9 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -1015,10 +1015,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -1087,10 +1087,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
packsswb xmm0, xmm1
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -1152,10 +1152,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
||||
vpaddb ymm0, ymm0, ymm5 // -> unsigned
|
||||
|
||||
// step 3 - store 16 U and 16 V values
|
||||
sub ecx, 32
|
||||
vextractf128 [edx], ymm0, 0 // U
|
||||
vextractf128 [edx + edi], ymm0, 1 // V
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 32
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -1197,7 +1197,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
||||
psraw xmm2, 8
|
||||
packsswb xmm0, xmm2
|
||||
paddb xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
|
||||
movdqu xmm0, [eax] // V
|
||||
@ -1217,6 +1216,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
||||
lea eax, [eax + 64]
|
||||
movdqu [edx + edi], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -1272,10 +1272,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -1342,10 +1342,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -1413,10 +1413,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -1484,10 +1484,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -2043,9 +2043,9 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
||||
por xmm3, xmm2 // BG
|
||||
por xmm1, xmm3 // BGR
|
||||
packssdw xmm0, xmm1
|
||||
sub ecx, 8
|
||||
movdqu [edx], xmm0 // store 8 pixels of RGB565
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -2411,9 +2411,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
||||
convertloop:
|
||||
movdqu xmm0, [eax - 16 + ecx]
|
||||
pshufb xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -2434,9 +2434,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
||||
vmovdqu ymm0, [eax - 32 + ecx]
|
||||
vpshufb ymm0, ymm0, ymm5
|
||||
vpermq ymm0, ymm0, 0x4e // swap high and low halfs
|
||||
sub ecx, 32
|
||||
vmovdqu [edx], ymm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 32
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
@ -2462,9 +2462,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||
pshuflw xmm0, xmm0, 0x1b // swap words
|
||||
pshufhw xmm0, xmm0, 0x1b
|
||||
pshufd xmm0, xmm0, 0x4e // swap qwords
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -2495,10 +2495,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
||||
movdqu xmm0, [eax]
|
||||
lea eax, [eax - 16]
|
||||
pshufb xmm0, xmm1
|
||||
sub ecx, 8
|
||||
movlpd qword ptr [edx], xmm0
|
||||
movhpd qword ptr [edx + edi], xmm0
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -2527,9 +2527,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
||||
movdqu xmm0, [eax]
|
||||
lea eax, [eax - 16]
|
||||
pshufb xmm0, xmm5
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -2548,14 +2548,14 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
||||
mov eax, [esp + 4] // src
|
||||
mov edx, [esp + 8] // dst
|
||||
mov ecx, [esp + 12] // width
|
||||
vmovdqa ymm5, kARGBShuffleMirror_AVX2
|
||||
vmovdqu ymm5, kARGBShuffleMirror_AVX2
|
||||
|
||||
align 4
|
||||
convertloop:
|
||||
vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
|
||||
sub ecx, 8
|
||||
vmovdqu [edx], ymm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
@ -2773,25 +2773,6 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAS_COPYROW_X86
|
||||
// Multiple of 4.
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
|
||||
__asm {
|
||||
mov eax, esi
|
||||
mov edx, edi
|
||||
mov esi, [esp + 4] // src
|
||||
mov edi, [esp + 8] // dst
|
||||
mov ecx, [esp + 12] // count
|
||||
shr ecx, 2
|
||||
rep movsd
|
||||
mov edi, edx
|
||||
mov esi, eax
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_COPYROW_X86
|
||||
|
||||
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
|
||||
// width in pixels
|
||||
__declspec(naked) __declspec(align(16))
|
||||
@ -2998,9 +2979,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
|
||||
vpand ymm1, ymm1, ymm5
|
||||
vpackuswb ymm0, ymm0, ymm1 // mutates.
|
||||
vpermq ymm0, ymm0, 0xd8
|
||||
sub ecx, 32
|
||||
vmovdqu [edx], ymm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 32
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
@ -3109,9 +3090,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
|
||||
vpsrlw ymm1, ymm1, 8
|
||||
vpackuswb ymm0, ymm0, ymm1 // mutates.
|
||||
vpermq ymm0, ymm0, 0xd8
|
||||
sub ecx, 32
|
||||
vmovdqu [edx], ymm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 32
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
@ -3223,9 +3204,9 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
|
||||
pand xmm0, xmm5 // even bytes are Y
|
||||
pand xmm1, xmm5
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -3328,9 +3309,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
|
||||
psrlw xmm0, 8 // odd bytes are Y
|
||||
psrlw xmm1, 8
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -3466,9 +3447,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
sub ecx, 1
|
||||
jge alignloop1
|
||||
|
||||
alignloop1b:
|
||||
@ -3497,9 +3478,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jge convertloop4
|
||||
|
||||
convertloop4b:
|
||||
@ -3528,9 +3509,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
sub ecx, 1
|
||||
jge convertloop1
|
||||
|
||||
convertloop1b:
|
||||
@ -3598,9 +3579,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
sub ecx, 1
|
||||
jge alignloop1
|
||||
|
||||
alignloop1b:
|
||||
@ -3627,9 +3608,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jge convertloop4
|
||||
|
||||
convertloop4b:
|
||||
@ -3656,9 +3637,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
sub ecx, 1
|
||||
jge convertloop1
|
||||
|
||||
convertloop1b:
|
||||
@ -3701,9 +3682,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
packuswb xmm0, xmm1
|
||||
pand xmm0, xmm5 // keep original alphas
|
||||
por xmm0, xmm2
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
|
||||
ret
|
||||
@ -3750,9 +3731,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
psrlw xmm1, 8
|
||||
packuswb xmm0, xmm1
|
||||
por xmm0, xmm2 // copy original alpha
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
|
||||
ret
|
||||
@ -3790,9 +3771,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
vpsrlw ymm1, ymm1, 8
|
||||
vpackuswb ymm0, ymm0, ymm1 // unmutated.
|
||||
vpor ymm0, ymm0, ymm6 // copy original alpha
|
||||
sub ecx, 8
|
||||
vmovdqu [eax + edx], ymm0
|
||||
lea eax, [eax + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
vzeroupper
|
||||
@ -3839,9 +3820,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
lea eax, [eax + 16]
|
||||
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
pop edi
|
||||
pop esi
|
||||
@ -3883,9 +3864,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
|
||||
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
|
||||
vpackuswb ymm0, ymm0, ymm1 // unmutated.
|
||||
sub ecx, 8
|
||||
vmovdqu [eax + edx], ymm0
|
||||
lea eax, [eax + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
vzeroupper
|
||||
@ -3945,9 +3926,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
|
||||
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
|
||||
vpackuswb ymm0, ymm0, ymm1 // unmutated.
|
||||
sub ecx, 8
|
||||
vmovdqu [eax + edx], ymm0
|
||||
lea eax, [eax + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -3993,10 +3974,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
movdqa xmm1, xmm0
|
||||
punpcklwd xmm0, xmm3 // GGGA first 4
|
||||
punpckhwd xmm1, xmm3 // GGGA next 4
|
||||
sub ecx, 8
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -4064,10 +4045,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
|
||||
movdqa xmm1, xmm0 // Weave BG, RA together
|
||||
punpcklwd xmm0, xmm5 // BGRA first 4
|
||||
punpckhwd xmm1, xmm5 // BGRA next 4
|
||||
sub ecx, 8
|
||||
movdqu [eax], xmm0
|
||||
movdqu [eax + 16], xmm1
|
||||
lea eax, [eax + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -4128,11 +4109,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
movdqa xmm6, xmm0 // Weave BG, RA together
|
||||
punpcklwd xmm0, xmm1 // BGRA first 4
|
||||
punpckhwd xmm6, xmm1 // BGRA next 4
|
||||
sub ecx, 8
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm6
|
||||
lea eax, [eax + 32]
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -4176,9 +4157,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
paddw xmm1, xmm4
|
||||
packuswb xmm0, xmm1
|
||||
por xmm0, xmm7
|
||||
sub ecx, 4
|
||||
movdqu [eax], xmm0
|
||||
lea eax, [eax + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
@ -4210,9 +4191,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
psrlw xmm0, 8
|
||||
psrlw xmm1, 8
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
|
||||
ret
|
||||
@ -4248,9 +4229,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
lea eax, [eax + 16]
|
||||
lea esi, [esi + 16]
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
|
||||
pop esi
|
||||
@ -4282,9 +4263,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
||||
lea esi, [esi + 16]
|
||||
paddusb xmm0, xmm1 // src_argb0 + src_argb1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jge convertloop4
|
||||
|
||||
convertloop49:
|
||||
@ -4297,9 +4278,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
movd xmm1, [esi] // read 1 pixels from src_argb1
|
||||
lea esi, [esi + 4]
|
||||
paddusb xmm0, xmm1 // src_argb0 + src_argb1
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
sub ecx, 1
|
||||
jge convertloop1
|
||||
|
||||
convertloop19:
|
||||
@ -4328,9 +4309,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
||||
movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
||||
lea esi, [esi + 16]
|
||||
psubusb xmm0, xmm1 // src_argb0 - src_argb1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
|
||||
pop esi
|
||||
@ -4482,9 +4463,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
||||
psubw xmm1, xmm0
|
||||
pmaxsw xmm0, xmm1
|
||||
packuswb xmm0, xmm0
|
||||
sub ecx, 8
|
||||
movq qword ptr [eax + edx], xmm0
|
||||
lea eax, [eax + 8]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
@ -4536,9 +4517,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
|
||||
psubw xmm1, xmm0
|
||||
pmaxsw xmm0, xmm1
|
||||
packuswb xmm0, xmm0
|
||||
sub ecx, 8
|
||||
movq qword ptr [eax + edx], xmm0
|
||||
lea eax, [eax + 8]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop esi
|
||||
@ -4585,12 +4566,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
punpckhwd xmm0, xmm0 // Last 4
|
||||
por xmm3, xmm5 // GGGA
|
||||
por xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm1
|
||||
movdqu [edx + 16], xmm2
|
||||
movdqu [edx + 32], xmm3
|
||||
movdqu [edx + 48], xmm0
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop esi
|
||||
@ -4618,9 +4599,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
||||
lea eax, [eax + 16]
|
||||
paddusb xmm0, xmm1 // sobel = sobelx + sobely
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop esi
|
||||
@ -4666,12 +4647,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
movdqa xmm7, xmm1 // YSXA
|
||||
punpcklwd xmm7, xmm0 // Next 4
|
||||
punpckhwd xmm1, xmm0 // Last 4
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm6
|
||||
movdqu [edx + 16], xmm4
|
||||
movdqu [edx + 32], xmm7
|
||||
movdqu [edx + 48], xmm1
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
pop esi
|
||||
@ -4983,9 +4964,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
||||
movd xmm0, [eax + edi] // read pixel 3
|
||||
punpckldq xmm6, xmm0 // combine pixel 2 and 3
|
||||
addps xmm3, xmm4 // x, y += dx, dy next 2
|
||||
sub ecx, 4
|
||||
movq qword ptr 8[edx], xmm6
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jge l4
|
||||
|
||||
l4b:
|
||||
@ -5001,9 +4982,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
|
||||
addps xmm2, xmm7 // x, y += dx, dy
|
||||
movd esi, xmm0
|
||||
movd xmm0, [eax + esi] // copy a pixel
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
sub ecx, 1
|
||||
jge l1
|
||||
l1b:
|
||||
pop edi
|
||||
@ -5059,9 +5040,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
vpsrlw ymm0, ymm0, 7
|
||||
vpsrlw ymm1, ymm1, 7
|
||||
vpackuswb ymm0, ymm0, ymm1 // unmutates
|
||||
sub ecx, 32
|
||||
vmovdqu [esi + edi], ymm0
|
||||
lea esi, [esi + 32]
|
||||
sub ecx, 32
|
||||
jg xloop
|
||||
jmp xloop99
|
||||
|
||||
@ -5072,9 +5053,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
vmovdqu ymm1, [esi + edx]
|
||||
vpavgb ymm0, ymm0, ymm1
|
||||
vpavgb ymm0, ymm0, ymm1
|
||||
sub ecx, 32
|
||||
vmovdqu [esi + edi], ymm0
|
||||
lea esi, [esi + 32]
|
||||
sub ecx, 32
|
||||
jg xloop25
|
||||
jmp xloop99
|
||||
|
||||
@ -5083,9 +5064,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
xloop50:
|
||||
vmovdqu ymm0, [esi]
|
||||
vpavgb ymm0, ymm0, [esi + edx]
|
||||
sub ecx, 32
|
||||
vmovdqu [esi + edi], ymm0
|
||||
lea esi, [esi + 32]
|
||||
sub ecx, 32
|
||||
jg xloop50
|
||||
jmp xloop99
|
||||
|
||||
@ -5096,9 +5077,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
vmovdqu ymm0, [esi + edx]
|
||||
vpavgb ymm0, ymm0, ymm1
|
||||
vpavgb ymm0, ymm0, ymm1
|
||||
sub ecx, 32
|
||||
vmovdqu [esi + edi], ymm0
|
||||
lea esi, [esi + 32]
|
||||
sub ecx, 32
|
||||
jg xloop75
|
||||
jmp xloop99
|
||||
|
||||
@ -5161,9 +5142,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
psrlw xmm0, 7
|
||||
psrlw xmm1, 7
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop
|
||||
jmp xloop99
|
||||
|
||||
@ -5174,9 +5155,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqu xmm1, [esi + edx]
|
||||
pavgb xmm0, xmm1
|
||||
pavgb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop25
|
||||
jmp xloop99
|
||||
|
||||
@ -5186,9 +5167,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqu xmm0, [esi]
|
||||
movdqu xmm1, [esi + edx]
|
||||
pavgb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop50
|
||||
jmp xloop99
|
||||
|
||||
@ -5199,9 +5180,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqu xmm0, [esi + edx]
|
||||
pavgb xmm0, xmm1
|
||||
pavgb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop75
|
||||
jmp xloop99
|
||||
|
||||
@ -5209,9 +5190,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
align 4
|
||||
xloop100:
|
||||
movdqu xmm0, [esi]
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop100
|
||||
|
||||
xloop99:
|
||||
@ -5273,9 +5254,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
paddw xmm0, xmm2 // sum rows
|
||||
paddw xmm1, xmm3
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop
|
||||
jmp xloop99
|
||||
|
||||
@ -5286,9 +5267,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqu xmm1, [esi + edx]
|
||||
pavgb xmm0, xmm1
|
||||
pavgb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop25
|
||||
jmp xloop99
|
||||
|
||||
@ -5298,9 +5279,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqu xmm0, [esi]
|
||||
movdqu xmm1, [esi + edx]
|
||||
pavgb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop50
|
||||
jmp xloop99
|
||||
|
||||
@ -5311,9 +5292,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqu xmm0, [esi + edx]
|
||||
pavgb xmm0, xmm1
|
||||
pavgb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop75
|
||||
jmp xloop99
|
||||
|
||||
@ -5321,9 +5302,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
align 4
|
||||
xloop100:
|
||||
movdqu xmm0, [esi]
|
||||
sub ecx, 16
|
||||
movdqu [esi + edi], xmm0
|
||||
lea esi, [esi + 16]
|
||||
sub ecx, 16
|
||||
jg xloop100
|
||||
|
||||
xloop99:
|
||||
@ -5352,9 +5333,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
|
||||
pshufb xmm0, xmm5
|
||||
pshufb xmm1, xmm5
|
||||
punpckldq xmm0, xmm1
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx], xmm0
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 8
|
||||
jg wloop
|
||||
ret
|
||||
}
|
||||
@ -5383,9 +5364,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
|
||||
pand xmm1, xmm5
|
||||
packssdw xmm0, xmm1
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx], xmm0
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 8
|
||||
jg wloop
|
||||
ret
|
||||
}
|
||||
@ -5409,10 +5390,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
lea eax, [eax + 32]
|
||||
pshufb xmm0, xmm5
|
||||
pshufb xmm1, xmm5
|
||||
sub ecx, 8
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg wloop
|
||||
ret
|
||||
}
|
||||
@ -5436,10 +5417,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||
lea eax, [eax + 64]
|
||||
vpshufb ymm0, ymm0, ymm5
|
||||
vpshufb ymm1, ymm1, ymm5
|
||||
sub ecx, 16
|
||||
vmovdqu [edx], ymm0
|
||||
vmovdqu [edx + 32], ymm1
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
vzeroupper
|
||||
@ -5502,9 +5483,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
pshufhw xmm1, xmm1, 01Bh
|
||||
pshuflw xmm1, xmm1, 01Bh
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg shuf_0123
|
||||
jmp shuf99
|
||||
|
||||
@ -5520,9 +5501,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
pshufhw xmm1, xmm1, 039h
|
||||
pshuflw xmm1, xmm1, 039h
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg shuf_0321
|
||||
jmp shuf99
|
||||
|
||||
@ -5538,9 +5519,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
pshufhw xmm1, xmm1, 093h
|
||||
pshuflw xmm1, xmm1, 093h
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg shuf_2103
|
||||
jmp shuf99
|
||||
|
||||
@ -5556,9 +5537,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
pshufhw xmm1, xmm1, 0C6h
|
||||
pshuflw xmm1, xmm1, 0C6h
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg shuf_3012
|
||||
|
||||
shuf99:
|
||||
@ -5700,9 +5681,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
||||
cvttps2dq xmm4, xmm4
|
||||
packuswb xmm0, xmm4
|
||||
packuswb xmm0, xmm0
|
||||
sub ecx, 2
|
||||
movq qword ptr [edx], xmm0
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 2
|
||||
jg convertloop
|
||||
pop esi
|
||||
ret
|
||||
@ -5740,9 +5721,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
||||
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
||||
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
|
||||
sub ecx, 2
|
||||
vmovq qword ptr [edx], xmm0
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 2
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
@ -5905,9 +5886,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
movzx edx, byte ptr [eax + 15] // copy alpha.
|
||||
mov byte ptr [edi + 15], dl
|
||||
|
||||
sub ecx, 4
|
||||
lea eax, [eax + 16]
|
||||
lea edi, [edi + 16]
|
||||
sub ecx, 4
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
|
||||
@ -534,11 +534,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
||||
"paddusw %%xmm0,%%xmm1 \n"
|
||||
"pmulhuw %%xmm5,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm1 \n"
|
||||
"sub $0x6,%2 \n"
|
||||
"movd %%xmm1," MEMACCESS(1) " \n"
|
||||
"psrlq $0x10,%%xmm1 \n"
|
||||
"movd %%xmm1," MEMACCESS2(0x2,1) " \n"
|
||||
"lea " MEMLEA(0x6,1) ",%1 \n"
|
||||
"sub $0x6,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
@ -602,11 +602,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
||||
"paddusw %%xmm7,%%xmm6 \n"
|
||||
"pmulhuw %%xmm4,%%xmm6 \n"
|
||||
"packuswb %%xmm6,%%xmm6 \n"
|
||||
"sub $0x6,%2 \n"
|
||||
"movd %%xmm6," MEMACCESS(1) " \n"
|
||||
"psrlq $0x10,%%xmm6 \n"
|
||||
"movd %%xmm6," MEMACCESS2(0x2,1) " \n"
|
||||
"lea " MEMLEA(0x6,1) ",%1 \n"
|
||||
"sub $0x6,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
@ -765,10 +765,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm0,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm1 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(0) " \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
: "+r"(dst_ptr), // %0
|
||||
@ -792,9 +792,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
||||
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"shufps $0xdd,%%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -820,9 +820,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
||||
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
||||
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
||||
"pavgb %%xmm2,%%xmm0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -852,9 +852,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
||||
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
||||
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
||||
"pavgb %%xmm2,%%xmm0 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -890,9 +890,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
|
||||
"punpckldq %%xmm3,%%xmm2 \n"
|
||||
"punpcklqdq %%xmm2,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(src_stepx_x4), // %1
|
||||
@ -941,9 +941,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
||||
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
||||
"shufps $0xdd,%%xmm1,%%xmm2 \n"
|
||||
"pavgb %%xmm2,%%xmm0 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%3 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(src_stepx_x4), // %1
|
||||
@ -997,9 +997,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
||||
"pextrw $0x3,%%xmm2,%k1 \n"
|
||||
"punpckldq %%xmm4,%%xmm1 \n"
|
||||
"punpcklqdq %%xmm1,%%xmm0 \n"
|
||||
"sub $0x4,%4 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(2) " \n"
|
||||
"lea " MEMLEA(0x10,2) ",%2 \n"
|
||||
"sub $0x4,%4 \n"
|
||||
"jge 40b \n"
|
||||
|
||||
"49: \n"
|
||||
@ -1046,10 +1046,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpckldq %%xmm0,%%xmm0 \n"
|
||||
"punpckhdq %%xmm1,%%xmm1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"movdqu %%xmm0," MEMACCESS(0) " \n"
|
||||
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
: "+r"(dst_argb), // %0
|
||||
|
||||
@ -111,9 +111,9 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
psrlw xmm0, 8 // isolate odd pixels.
|
||||
psrlw xmm1, 8
|
||||
packuswb xmm0, xmm1
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
ret
|
||||
@ -149,9 +149,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
pavgw xmm1, xmm3
|
||||
packuswb xmm0, xmm1
|
||||
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
ret
|
||||
@ -192,9 +192,9 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
pavgw xmm1, xmm3
|
||||
packuswb xmm0, xmm1
|
||||
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg wloop
|
||||
|
||||
pop esi
|
||||
@ -226,9 +226,9 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
packuswb xmm0, xmm1
|
||||
psrlw xmm0, 8
|
||||
packuswb xmm0, xmm0
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx], xmm0
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 8
|
||||
jg wloop
|
||||
|
||||
ret
|
||||
@ -285,9 +285,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
pavgw xmm0, xmm2
|
||||
packuswb xmm0, xmm0
|
||||
|
||||
sub ecx, 8
|
||||
movq qword ptr [edx], xmm0
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 8
|
||||
jg wloop
|
||||
|
||||
pop edi
|
||||
@ -398,9 +398,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
|
||||
paddsw xmm0, xmm7
|
||||
psrlw xmm0, 2
|
||||
packuswb xmm0, xmm0
|
||||
sub ecx, 24
|
||||
movq qword ptr [edx + 16], xmm0
|
||||
lea edx, [edx + 24]
|
||||
sub ecx, 24
|
||||
jg wloop
|
||||
|
||||
pop esi
|
||||
@ -460,9 +460,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
|
||||
paddsw xmm0, xmm7
|
||||
psrlw xmm0, 2
|
||||
packuswb xmm0, xmm0
|
||||
sub ecx, 24
|
||||
movq qword ptr [edx + 16], xmm0
|
||||
lea edx, [edx+24]
|
||||
sub ecx, 24
|
||||
jg wloop
|
||||
|
||||
pop esi
|
||||
@ -493,11 +493,11 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
pshufb xmm1, xmm5
|
||||
paddusb xmm0, xmm1
|
||||
|
||||
sub ecx, 12
|
||||
movq qword ptr [edx], xmm0 // write 12 pixels
|
||||
movhlps xmm1, xmm0
|
||||
movd [edx + 8], xmm1
|
||||
lea edx, [edx + 12]
|
||||
sub ecx, 12
|
||||
jg xloop
|
||||
|
||||
ret
|
||||
@ -558,11 +558,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
|
||||
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
|
||||
packuswb xmm6, xmm6
|
||||
|
||||
sub ecx, 6
|
||||
movd [edx], xmm6 // write 6 pixels
|
||||
psrlq xmm6, 16
|
||||
movd [edx + 2], xmm6
|
||||
lea edx, [edx + 6]
|
||||
sub ecx, 6
|
||||
jg xloop
|
||||
|
||||
pop esi
|
||||
@ -604,11 +604,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
|
||||
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
|
||||
packuswb xmm1, xmm1
|
||||
|
||||
sub ecx, 6
|
||||
movd [edx], xmm1 // write 6 pixels
|
||||
psrlq xmm1, 16
|
||||
movd [edx + 2], xmm1
|
||||
lea edx, [edx + 6]
|
||||
sub ecx, 6
|
||||
jg xloop
|
||||
|
||||
pop esi
|
||||
@ -784,10 +784,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqa xmm1, xmm0
|
||||
punpcklbw xmm0, xmm0
|
||||
punpckhbw xmm1, xmm1
|
||||
sub ecx, 32
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 32
|
||||
jg wloop
|
||||
|
||||
ret
|
||||
@ -812,9 +812,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
|
||||
movdqu xmm1, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
shufps xmm0, xmm1, 0xdd
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg wloop
|
||||
|
||||
ret
|
||||
@ -842,9 +842,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
|
||||
shufps xmm0, xmm1, 0x88 // even pixels
|
||||
shufps xmm2, xmm1, 0xdd // odd pixels
|
||||
pavgb xmm0, xmm2
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg wloop
|
||||
|
||||
ret
|
||||
@ -877,9 +877,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
|
||||
shufps xmm0, xmm1, 0x88 // even pixels
|
||||
shufps xmm2, xmm1, 0xdd // odd pixels
|
||||
pavgb xmm0, xmm2
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg wloop
|
||||
|
||||
pop esi
|
||||
@ -914,9 +914,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
lea eax, [eax + ebx * 4]
|
||||
punpckldq xmm2, xmm3
|
||||
punpcklqdq xmm0, xmm2
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg wloop
|
||||
|
||||
pop edi
|
||||
@ -963,9 +963,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
|
||||
shufps xmm0, xmm1, 0x88 // even pixels
|
||||
shufps xmm2, xmm1, 0xdd // odd pixels
|
||||
pavgb xmm0, xmm2
|
||||
sub ecx, 4
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 4
|
||||
jg wloop
|
||||
|
||||
pop edi
|
||||
@ -1021,9 +1021,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
|
||||
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
|
||||
punpckldq xmm1, xmm4 // x2 x3
|
||||
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
|
||||
sub ecx, 4 // 4 pixels
|
||||
movdqu [edi], xmm0
|
||||
lea edi, [edi + 16]
|
||||
sub ecx, 4 // 4 pixels
|
||||
jge xloop4
|
||||
|
||||
align 4
|
||||
@ -1160,10 +1160,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
|
||||
movdqa xmm1, xmm0
|
||||
punpckldq xmm0, xmm0
|
||||
punpckhdq xmm1, xmm1
|
||||
sub ecx, 8
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg wloop
|
||||
|
||||
ret
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user