From 91f240c5db1116f3baaa79fa34e3180e943bf2ed Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Thu, 20 Nov 2014 21:14:27 +0000
Subject: [PATCH] Move sub before branch in loops. Remove CopyRow_X86. Add
 CopyRow_Any versions for AVX, SSE2 and NEON.

Placing each loop's counter decrement (sub) immediately before the
conditional branch (jg) keeps the flag-setting instruction adjacent to the
branch that consumes it. The CopyRow_Any wrappers run the SIMD kernel on the
bulk of the row and finish the remainder in C, so widths that are not a
multiple of the vector size no longer fall back to C for the whole row. A
sketch of both patterns follows the diff.

BUG=269
TESTED=local build
R=harryjin@google.com, tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/26209004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1175 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |   2 +-
 include/libyuv/row.h       |   5 +-
 include/libyuv/version.h   |   2 +-
 source/compare_posix.cc    |   4 +-
 source/compare_win.cc      |   8 +-
 source/convert.cc          |  41 +++-----
 source/planar_functions.cc |  22 ++---
 source/rotate.cc           |  23 ++---
 source/rotate_argb.cc      |  23 ++---
 source/row_any.cc          |  23 ++++-
 source/row_posix.cc        | 157 ++++++++++++++-----------
 source/row_win.cc          | 187 +++++++++++++++++--------------------
 source/scale_posix.cc      |  20 ++--
 source/scale_win.cc        |  36 +++----
 14 files changed, 254 insertions(+), 299 deletions(-)

diff --git a/README.chromium b/README.chromium
index 1b82686cc..17a4ee79f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1174
+Version: 1175
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index b2059a619..29b364607 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -111,7 +111,6 @@ extern "C" {
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
-#define HAS_COPYROW_X86
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
 #define HAS_I422TOARGB1555ROW_SSSE3
@@ -877,10 +876,12 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_AVX(const uint8* src, uint8* dst, int count);
 void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
 void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
 
 void CopyRow_16_C(const uint16* src, uint16* dst, int count);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2dbb5e0ce..9fb4864c7 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1174
+#define LIBYUV_VERSION 1175
 
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/compare_posix.cc b/source/compare_posix.cc
index 64dfc3578..93c4fdfad 100644
--- a/source/compare_posix.cc
+++ b/source/compare_posix.cc
@@ -29,7 +29,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     "lea " MEMLEA(0x10, 0) ",%0 \n"
     "movdqu " MEMACCESS(1) ",%%xmm2 \n"
     "lea " MEMLEA(0x10, 1) ",%1 \n"
-    "sub $0x10,%2 \n"
     "movdqa %%xmm1,%%xmm3 \n"
     "psubusb %%xmm2,%%xmm1 \n"
     "psubusb %%xmm3,%%xmm2 \n"
@@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     "pmaddwd %%xmm2,%%xmm2 \n"
     "paddd %%xmm1,%%xmm0 \n"
     "paddd %%xmm2,%%xmm0 \n"
+    "sub $0x10,%2 \n"
     "jg 1b \n"
 
     "pshufd $0xee,%%xmm0,%%xmm1 \n"
@@ -124,13 +124,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int
count, uint32 seed) { "pmulld %%xmm5,%%xmm1 \n" "paddd %%xmm4,%%xmm3 \n" "paddd %%xmm2,%%xmm1 \n" - "sub $0x10,%1 \n" "paddd %%xmm3,%%xmm1 \n" "pshufd $0xe,%%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm1 \n" "pshufd $0x1,%%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" "jg 1b \n" "movd %%xmm0,%3 \n" : "+r"(src), // %0 diff --git a/source/compare_win.cc b/source/compare_win.cc index 50d4d3464..0b6922abd 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -33,7 +33,6 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { lea eax, [eax + 16] movdqu xmm2, [edx] lea edx, [edx + 16] - sub ecx, 16 movdqa xmm3, xmm1 // abs trick psubusb xmm1, xmm2 psubusb xmm2, xmm3 @@ -45,6 +44,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { pmaddwd xmm2, xmm2 paddd xmm0, xmm1 paddd xmm0, xmm2 + sub ecx, 16 jg wloop pshufd xmm1, xmm0, 0xee @@ -75,7 +75,6 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { vmovdqu ymm1, [eax] vmovdqu ymm2, [eax + edx] lea eax, [eax + 32] - sub ecx, 32 vpsubusb ymm3, ymm1, ymm2 // abs difference trick vpsubusb ymm2, ymm2, ymm1 vpor ymm1, ymm2, ymm3 @@ -85,6 +84,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { vpmaddwd ymm1, ymm1, ymm1 vpaddd ymm0, ymm0, ymm1 vpaddd ymm0, ymm0, ymm2 + sub ecx, 32 jg wloop vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. @@ -170,7 +170,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { pmulld(0xcd) // pmulld xmm1, xmm5 paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 - sub ecx, 16 paddd xmm1, xmm3 pshufd xmm2, xmm1, 0x0e // upper 2 dwords @@ -178,6 +177,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { pshufd xmm2, xmm1, 0x01 paddd xmm1, xmm2 paddd xmm0, xmm1 + sub ecx, 16 jg wloop movd eax, xmm0 // return hash @@ -209,13 +209,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { pmulld xmm1, kHashMul3 paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 - sub ecx, 16 paddd xmm1, xmm3 pshufd xmm2, xmm1, 0x0e // upper 2 dwords paddd xmm1, xmm2 pshufd xmm2, xmm1, 0x01 paddd xmm1, xmm2 paddd xmm0, xmm1 + sub ecx, 16 jg wloop movd eax, xmm0 // return hash diff --git a/source/convert.cc b/source/convert.cc index 79456a6a4..5f58f55af 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -188,19 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, int width, int height) { int y; void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; - } -#endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { - CopyRow = CopyRow_AVX; + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -209,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, } #endif #if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; } #endif #if defined(HAS_COPYROW_MIPS) @@ -419,24 +414,14 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, dst_stride_v = -dst_stride_v; } // CopyRow for rows of just Y in Q420 copied to Y plane of I420. -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_X86) - if (IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; - } -#endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { - CopyRow = CopyRow_AVX; + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -444,12 +429,16 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, CopyRow = CopyRow_ERMS; } #endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { CopyRow = CopyRow_MIPS; } #endif - #if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 788ce09f3..db8699cf3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -41,19 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y, if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; - } -#endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { - CopyRow = CopyRow_AVX; + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -62,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; } #endif #if defined(HAS_COPYROW_MIPS) @@ -93,11 +88,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, height = 1; src_stride_y = dst_stride_y = 0; } -#if defined(HAS_COPYROW_16_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_16_X86; - } -#endif #if defined(HAS_COPYROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_16_SSE2; diff --git a/source/rotate.cc b/source/rotate.cc index f65f89dda..d4b581d19 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -918,24 +918,14 @@ void RotatePlane180(const uint8* src, int src_stride, MirrorRow = MirrorRow_MIPS_DSPR2; } #endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; - } -#endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { - CopyRow = CopyRow_AVX; + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -943,6 +933,11 @@ void RotatePlane180(const uint8* src, int src_stride, CopyRow = CopyRow_ERMS; } #endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; + } +#endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { CopyRow = CopyRow_MIPS; diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index f40416af5..d65ba8c23 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -125,24 +125,14 @@ void ARGBRotate180(const uint8* src, int src_stride, } } #endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_X86) - if (TestCpuFlag(kCpuHasX86)) { - CopyRow = CopyRow_X86; - } -#endif #if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32)) { - CopyRow = CopyRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) - if (TestCpuFlag(kCpuHasAVX) && IS_ALIGNED(width, 64)) { - CopyRow = CopyRow_AVX; + if (TestCpuFlag(kCpuHasAVX)) { + CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) @@ -150,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride, CopyRow = CopyRow_ERMS; } #endif +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + CopyRow = IS_ALIGNED(width * 4, 32) ? 
CopyRow_NEON : CopyRow_Any_NEON; + } +#endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { CopyRow = CopyRow_MIPS; diff --git a/source/row_any.cc b/source/row_any.cc index e9de3dab5..54b117567 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -621,8 +621,6 @@ NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C, #endif #undef NANY - - #define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \ void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \ int n = width & ~MASK; \ @@ -659,6 +657,27 @@ MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3) #endif #undef MANY +#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \ + void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \ + int n = width & ~MASK; \ + int r = width & MASK; \ + if (n > 0) { \ + COPY_SIMD(src_y, dst_y, n); \ + } \ + COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \ + } + +#ifdef HAS_COPYROW_AVX +MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63) +#endif +#ifdef HAS_COPYROW_SSE2 +MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31) +#endif +#ifdef HAS_COPYROW_NEON +MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31) +#endif +#undef MANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_posix.cc b/source/row_posix.cc index f459f3233..949a8d120 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -296,9 +296,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "pshufb %%xmm4,%%xmm3 \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" - "sub $0x10,%2 \n" "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 @@ -337,9 +337,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "pshufb %%xmm4,%%xmm3 \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" - "sub $0x10,%2 \n" "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 @@ -725,9 +725,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 @@ -765,9 +765,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 @@ -837,10 +837,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -910,10 +910,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -961,7 
+961,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" @@ -980,6 +979,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "lea " MEMLEA(0x40,0) ",%0 \n" MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 @@ -1038,10 +1038,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1080,9 +1080,9 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 @@ -1145,10 +1145,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 @@ -1186,9 +1186,9 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 @@ -1223,9 +1223,9 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1288,10 +1288,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 @@ -1357,10 +1357,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movlps %%xmm0," MEMACCESS(1) " \n" MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 @@ -2186,9 +2186,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { "1: \n" MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -2215,9 +2215,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 
MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "sub $0x20,%2 \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -2249,9 +2249,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { "pshuflw $0x1b,%%xmm0,%%xmm0 \n" "pshufhw $0x1b,%%xmm0,%%xmm0 \n" "pshufd $0x4e,%%xmm0,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1)",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -2285,10 +2285,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "movdqu " MEMACCESS(0) ",%%xmm0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n" "pshufb %%xmm1,%%xmm0 \n" - "sub $8,%3 \n" "movlpd %%xmm0," MEMACCESS(1) " \n" MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $8,%3 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst_u), // %1 @@ -2322,9 +2322,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { "movdqu " MEMACCESS(0) ",%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" "lea " MEMLEA(-0x10,0) ",%0 \n" - "sub $0x4,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -2346,13 +2346,13 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = { void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( - "vmovdqa %3,%%ymm5 \n" + "vmovdqu %3,%%ymm5 \n" LABELALIGN "1: \n" VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 - "sub $0x20,%2 \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 @@ -2574,21 +2574,6 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_AVX -#ifdef HAS_COPYROW_X86 -void CopyRow_X86(const uint8* src, uint8* dst, int width) { - size_t width_tmp = (size_t)(width); - asm volatile ( - "shr $0x2,%2 \n" - "rep movsl " MEMMOVESTRING(0,1) " \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc" - ); -} -#endif // HAS_COPYROW_X86 - #ifdef HAS_COPYROW_ERMS // Multiple of 1. 
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { @@ -2894,9 +2879,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -3006,9 +2991,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { "vpand %%ymm5,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "sub $0x20,%2 \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_yuy2), // %0 @@ -3119,9 +3104,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { "vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "sub $0x20,%2 \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_uyvy), // %0 @@ -3263,9 +3248,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 10b \n" "19: \n" @@ -3295,9 +3280,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jge 41b \n" "49: \n" @@ -3326,9 +3311,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -3398,9 +3383,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 10b \n" "19: \n" @@ -3428,9 +3413,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jge 40b \n" "49: \n" @@ -3457,9 +3442,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%3 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x4,2) ",%2 \n" + "sub $0x1,%3 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -3505,9 +3490,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "packuswb %%xmm1,%%xmm0 \n" "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3558,9 +3543,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : 
"+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3603,9 +3588,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { "vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "sub $0x8,%2 \n" MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 @@ -3651,9 +3636,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pmulhuw %%xmm2,%%xmm1 \n" "lea " MEMLEA(0x10,0) ",%0 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3723,9 +3708,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "sub $0x8,%2 \n" MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 @@ -3776,10 +3761,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" - "sub $0x8,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3853,10 +3838,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" - "sub $0x8,%1 \n" "movdqu %%xmm0," MEMACCESS(0) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" + "sub $0x8,%1 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -3919,11 +3904,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "movdqa %%xmm0,%%xmm6 \n" "punpcklwd %%xmm1,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm6 \n" - "sub $0x8,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,0) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3972,9 +3957,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "paddw %%xmm4,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" - "sub $0x4,%1 \n" "movdqu %%xmm0," MEMACCESS(0) " \n" "lea " MEMLEA(0x10,0) ",%0 \n" + "sub $0x4,%1 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4011,9 +3996,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4050,9 +4035,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4119,9 +4104,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "movdqu " MEMACCESS(1) ",%%xmm1 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " 
MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4179,9 +4164,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "movdqu " MEMACCESS(1) ",%%xmm1 \n" "lea " MEMLEA(0x10,1) ",%1 \n" "psubusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4264,9 +4249,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, "psubw %%xmm0,%%xmm1 \n" "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "sub $0x8,%4 \n" MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x8,%4 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -4322,9 +4307,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, "psubw %%xmm0,%%xmm1 \n" "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "sub $0x8,%3 \n" MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x8,%3 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -4375,12 +4360,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "punpckhwd %%xmm0,%%xmm0 \n" "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" - "sub $0x10,%3 \n" "movdqu %%xmm1," MEMACCESS(2) " \n" "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -4414,9 +4399,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" - "sub $0x10,%3 \n" "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -4466,12 +4451,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "movdqa %%xmm1,%%xmm7 \n" "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "sub $0x10,%3 \n" "movdqu %%xmm6," MEMACCESS(2) " \n" "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" "lea " MEMLEA(0x40,2) ",%2 \n" + "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -4757,9 +4742,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 "punpckldq %%xmm6,%%xmm0 \n" "addps %%xmm4,%%xmm3 \n" - "sub $0x4,%4 \n" "movq %%xmm0," MEMACCESS2(0x08,2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%4 \n" "jge 40b \n" "49: \n" @@ -4775,9 +4760,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "addps %%xmm7,%%xmm2 \n" "movd %%xmm0,%k1 \n" MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - "sub $0x1,%4 \n" "movd %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x04,2) ",%2 \n" + "sub $0x1,%4 \n" "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 @@ -4836,9 +4821,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" "jmp 99f \n" @@ -4849,9 +4834,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 
\n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 25b \n" "jmp 99f \n" @@ -4861,9 +4846,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "movdqu " MEMACCESS(1) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 50b \n" "jmp 99f \n" @@ -4874,9 +4859,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, MEMOPREG(movdqu,0x00,1,4,1,xmm0) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 75b \n" "jmp 99f \n" @@ -4884,9 +4869,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, LABELALIGN "100: \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 100b \n" "99: \n" @@ -4952,9 +4937,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" "jmp 99f \n" @@ -4965,9 +4950,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 25b \n" "jmp 99f \n" @@ -4977,9 +4962,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "movdqu " MEMACCESS(1) ",%%xmm0 \n" MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 50b \n" "jmp 99f \n" @@ -4990,9 +4975,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 75b \n" "jmp 99f \n" @@ -5000,9 +4985,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, LABELALIGN "100: \n" "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "sub $0x10,%2 \n" MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 100b \n" "99: \n" @@ -5037,9 +5022,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "punpckldq %%xmm1,%%xmm0 \n" - "sub $0x8,%2 \n" "movq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 @@ -5070,9 +5055,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, "pand %%xmm5,%%xmm1 \n" "packssdw %%xmm1,%%xmm0 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x8,%2 \n" "movq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 @@ -5099,10 +5084,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" - "sub 
$0x8,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -5129,10 +5114,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, "lea " MEMLEA(0x40,0) ",%0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "sub $0x10,%2 \n" "vmovdqu %%ymm0," MEMACCESS(1) " \n" "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 @@ -5196,9 +5181,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0x1b,%%xmm1,%%xmm1 \n" "pshuflw $0x1b,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 123b \n" "jmp 99f \n" @@ -5214,9 +5199,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0x39,%%xmm1,%%xmm1 \n" "pshuflw $0x39,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 321b \n" "jmp 99f \n" @@ -5232,9 +5217,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0x93,%%xmm1,%%xmm1 \n" "pshuflw $0x93,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 2103b \n" "jmp 99f \n" @@ -5250,9 +5235,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, "pshufhw $0xc6,%%xmm1,%%xmm1 \n" "pshuflw $0xc6,%%xmm1,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x4,%3 \n" "jg 3012b \n" "99: \n" @@ -5394,9 +5379,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, "cvttps2dq %%xmm4,%%xmm4 \n" "packuswb %%xmm4,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "sub $0x2,%2 \n" "movq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x2,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -5435,9 +5420,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "sub $0x2,%2 \n" "vmovq %%xmm0," MEMACCESS(1) " \n" "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x2,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 @@ -5597,9 +5582,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, "mov %b0," MEMACCESS2(0xe,3) " \n" "movzb " MEMACCESS2(0xf,2) ",%0 \n" "mov %b0," MEMACCESS2(0xf,3) " \n" - "sub $0x4,%4 \n" "lea " MEMLEA(0x10,2) ",%2 \n" "lea " MEMLEA(0x10,3) ",%3 \n" + "sub $0x4,%4 \n" "jg 1b \n" : "+d"(pixel_temp), // %0 "+a"(table_temp), // %1 diff --git a/source/row_win.cc b/source/row_win.cc index aea284d9a..3eb16ed3f 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -313,9 +313,9 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 - sub ecx, 16 movdqu [edx + 48], xmm3 lea edx, [edx + 64] + sub ecx, 16 jg convertloop ret } @@ -353,9 +353,9 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 - sub ecx, 16 movdqu [edx + 48], xmm3 lea edx, [edx + 64] + sub ecx, 16 jg convertloop ret } @@ -728,9 +728,9 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw 
xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -764,9 +764,9 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -782,7 +782,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov ecx, [esp + 12] /* pix */ vbroadcastf128 ymm4, kARGBToY vbroadcastf128 ymm5, kAddY16 - vmovdqa ymm6, kPermdARGBToY_AVX + vmovdqu ymm6, kPermdARGBToY_AVX align 4 convertloop: @@ -802,9 +802,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { vpackuswb ymm0, ymm0, ymm2 // mutates. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. vpaddb ymm0, ymm0, ymm5 - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper ret @@ -822,7 +822,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov ecx, [esp + 12] /* pix */ vbroadcastf128 ymm4, kARGBToYJ vbroadcastf128 ymm5, kAddYJ64 - vmovdqa ymm6, kPermdARGBToY_AVX + vmovdqu ymm6, kPermdARGBToY_AVX align 4 convertloop: @@ -843,9 +843,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { vpsrlw ymm2, ymm2, 7 vpackuswb ymm0, ymm0, ymm2 // mutates. vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper @@ -880,9 +880,9 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -914,9 +914,9 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -948,9 +948,9 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { psrlw xmm2, 7 packuswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -1015,10 +1015,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1087,10 +1087,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, packsswb xmm0, xmm1 // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1152,10 +1152,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpaddb ymm0, ymm0, ymm5 // -> unsigned // step 3 - store 16 U and 16 V values - sub ecx, 32 vextractf128 [edx], ymm0, 0 // U vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] + sub ecx, 32 jg convertloop pop edi @@ -1197,7 +1197,6 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, psraw xmm2, 8 packsswb xmm0, xmm2 paddb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 movdqu xmm0, [eax] // V @@ -1217,6 +1216,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, lea eax, [eax + 64] movdqu [edx + edi], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop pop edi @@ -1272,10 +1272,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, paddb xmm0, 
xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1342,10 +1342,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1413,10 +1413,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -1484,10 +1484,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, paddb xmm0, xmm5 // -> unsigned // step 3 - store 8 U and 8 V values - sub ecx, 16 movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] + sub ecx, 16 jg convertloop pop edi @@ -2043,9 +2043,9 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, por xmm3, xmm2 // BG por xmm1, xmm3 // BGR packssdw xmm0, xmm1 - sub ecx, 8 movdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] + sub ecx, 8 jg convertloop pop edi @@ -2411,9 +2411,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { convertloop: movdqu xmm0, [eax - 16 + ecx] pshufb xmm0, xmm5 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -2434,9 +2434,9 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { vmovdqu ymm0, [eax - 32 + ecx] vpshufb ymm0, ymm0, ymm5 vpermq ymm0, ymm0, 0x4e // swap high and low halfs - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper ret @@ -2462,9 +2462,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { pshuflw xmm0, xmm0, 0x1b // swap words pshufhw xmm0, xmm0, 0x1b pshufd xmm0, xmm0, 0x4e // swap qwords - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -2495,10 +2495,10 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, movdqu xmm0, [eax] lea eax, [eax - 16] pshufb xmm0, xmm1 - sub ecx, 8 movlpd qword ptr [edx], xmm0 movhpd qword ptr [edx + edi], xmm0 lea edx, [edx + 8] + sub ecx, 8 jg convertloop pop edi @@ -2527,9 +2527,9 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { movdqu xmm0, [eax] lea eax, [eax - 16] pshufb xmm0, xmm5 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop ret } @@ -2548,14 +2548,14 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - vmovdqa ymm5, kARGBShuffleMirror_AVX2 + vmovdqu ymm5, kARGBShuffleMirror_AVX2 align 4 convertloop: vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order - sub ecx, 8 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 8 jg convertloop vzeroupper ret @@ -2773,25 +2773,6 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { } } -#ifdef HAS_COPYROW_X86 -// Multiple of 4. 
-__declspec(naked) __declspec(align(16)) -void CopyRow_X86(const uint8* src, uint8* dst, int count) { - __asm { - mov eax, esi - mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // count - shr ecx, 2 - rep movsd - mov edi, edx - mov esi, eax - ret - } -} -#endif // HAS_COPYROW_X86 - #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels __declspec(naked) __declspec(align(16)) @@ -2998,9 +2979,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper ret @@ -3109,9 +3090,9 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 - sub ecx, 32 vmovdqu [edx], ymm0 lea edx, [edx + 32] + sub ecx, 32 jg convertloop vzeroupper ret @@ -3223,9 +3204,9 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -3328,9 +3309,9 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 - sub ecx, 16 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 16 jg convertloop ret } @@ -3466,9 +3447,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge alignloop1 alignloop1b: @@ -3497,9 +3478,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jge convertloop4 convertloop4b: @@ -3528,9 +3509,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge convertloop1 convertloop1b: @@ -3598,9 +3579,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge alignloop1 alignloop1b: @@ -3627,9 +3608,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jge convertloop4 convertloop4b: @@ -3656,9 +3637,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, paddusb xmm0, xmm2 // + src argb pand xmm1, xmm5 // a_g_ convert to 8 bits again paddusb xmm0, xmm1 // + src argb - sub ecx, 1 movd [edx], xmm0 lea edx, [edx + 4] + sub ecx, 1 jge convertloop1 convertloop1b: @@ -3701,9 +3682,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { packuswb xmm0, xmm1 pand xmm0, xmm5 // keep original alphas por xmm0, xmm2 - sub ecx, 4 movdqu [edx], xmm0 lea edx, [edx + 16] + sub ecx, 4 jg convertloop ret @@ -3750,9 +3731,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { psrlw xmm1, 8 
     packuswb xmm0, xmm1
     por xmm0, xmm2 // copy original alpha
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg convertloop

     ret
@@ -3790,9 +3771,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
     vpsrlw ymm1, ymm1, 8
     vpackuswb ymm0, ymm0, ymm1 // unmutated.
     vpor ymm0, ymm0, ymm6 // copy original alpha
-    sub ecx, 8
     vmovdqu [eax + edx], ymm0
     lea eax, [eax + 32]
+    sub ecx, 8
     jg convertloop

     vzeroupper
@@ -3839,9 +3820,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     lea eax, [eax + 16]
     packuswb xmm0, xmm1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg convertloop

     pop edi
     pop esi
@@ -3883,9 +3864,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
     vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
     vpackuswb ymm0, ymm0, ymm1 // unmutated.
-    sub ecx, 8
     vmovdqu [eax + edx], ymm0
     lea eax, [eax + 32]
+    sub ecx, 8
     jg convertloop

     vzeroupper
@@ -3945,9 +3926,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
     vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
     vpackuswb ymm0, ymm0, ymm1 // unmutated.
-    sub ecx, 8
     vmovdqu [eax + edx], ymm0
     lea eax, [eax + 32]
+    sub ecx, 8
     jg convertloop

     pop edi
@@ -3993,10 +3974,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     movdqa xmm1, xmm0
     punpcklwd xmm0, xmm3 // GGGA first 4
     punpckhwd xmm1, xmm3 // GGGA next 4
-    sub ecx, 8
     movdqu [edx], xmm0
     movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 8
     jg convertloop
     ret
   }
@@ -4064,10 +4045,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
     movdqa xmm1, xmm0 // Weave BG, RA together
     punpcklwd xmm0, xmm5 // BGRA first 4
     punpckhwd xmm1, xmm5 // BGRA next 4
-    sub ecx, 8
     movdqu [eax], xmm0
     movdqu [eax + 16], xmm1
     lea eax, [eax + 32]
+    sub ecx, 8
     jg convertloop
     ret
   }
@@ -4128,11 +4109,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
     movdqa xmm6, xmm0 // Weave BG, RA together
     punpcklwd xmm0, xmm1 // BGRA first 4
     punpckhwd xmm6, xmm1 // BGRA next 4
-    sub ecx, 8
     movdqu [edx], xmm0
     movdqu [edx + 16], xmm6
     lea eax, [eax + 32]
     lea edx, [edx + 32]
+    sub ecx, 8
     jg convertloop
     ret
   }
@@ -4176,9 +4157,9 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
     paddw xmm1, xmm4
     packuswb xmm0, xmm1
     por xmm0, xmm7
-    sub ecx, 4
     movdqu [eax], xmm0
     lea eax, [eax + 16]
+    sub ecx, 4
     jg convertloop
     ret
   }
@@ -4210,9 +4191,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     psrlw xmm0, 8
     psrlw xmm1, 8
     packuswb xmm0, xmm1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg convertloop

     ret
@@ -4248,9 +4229,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     lea eax, [eax + 16]
     lea esi, [esi + 16]
     packuswb xmm0, xmm1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg convertloop

     pop esi
@@ -4282,9 +4263,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     movdqu xmm1, [esi] // read 4 pixels from src_argb1
     lea esi, [esi + 16]
     paddusb xmm0, xmm1 // src_argb0 + src_argb1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jge convertloop4

   convertloop49:
@@ -4297,9 +4278,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     movd xmm1, [esi] // read 1 pixels from src_argb1
     lea esi, [esi + 4]
     paddusb xmm0, xmm1 // src_argb0 + src_argb1
-    sub ecx, 1
     movd [edx], xmm0
     lea edx, [edx + 4]
+    sub ecx, 1
     jge convertloop1

   convertloop19:
@@ -4328,9 +4309,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     movdqu xmm1, [esi] // read 4 pixels from src_argb1
     lea esi, [esi + 16]
     psubusb xmm0, xmm1 // src_argb0 - src_argb1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg convertloop

     pop esi
@@ -4482,9 +4463,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     psubw xmm1, xmm0
     pmaxsw xmm0, xmm1
     packuswb xmm0, xmm0
-    sub ecx, 8
     movq qword ptr [eax + edx], xmm0
     lea eax, [eax + 8]
+    sub ecx, 8
     jg convertloop

     pop edi
@@ -4536,9 +4517,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
     psubw xmm1, xmm0
     pmaxsw xmm0, xmm1
     packuswb xmm0, xmm0
-    sub ecx, 8
     movq qword ptr [eax + edx], xmm0
     lea eax, [eax + 8]
+    sub ecx, 8
     jg convertloop

     pop esi
@@ -4585,12 +4566,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
     punpckhwd xmm0, xmm0 // Last 4
     por xmm3, xmm5 // GGGA
     por xmm0, xmm5
-    sub ecx, 16
     movdqu [edx], xmm1
     movdqu [edx + 16], xmm2
     movdqu [edx + 32], xmm3
     movdqu [edx + 48], xmm0
     lea edx, [edx + 64]
+    sub ecx, 16
     jg convertloop

     pop esi
@@ -4618,9 +4599,9 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
     movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
     lea eax, [eax + 16]
     paddusb xmm0, xmm1 // sobel = sobelx + sobely
-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg convertloop

     pop esi
@@ -4666,12 +4647,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
     movdqa xmm7, xmm1 // YSXA
     punpcklwd xmm7, xmm0 // Next 4
     punpckhwd xmm1, xmm0 // Last 4
-    sub ecx, 16
     movdqu [edx], xmm6
     movdqu [edx + 16], xmm4
     movdqu [edx + 32], xmm7
     movdqu [edx + 48], xmm1
     lea edx, [edx + 64]
+    sub ecx, 16
     jg convertloop

     pop esi
@@ -4983,9 +4964,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     movd xmm0, [eax + edi] // read pixel 3
     punpckldq xmm6, xmm0 // combine pixel 2 and 3
     addps xmm3, xmm4 // x, y += dx, dy next 2
-    sub ecx, 4
     movq qword ptr 8[edx], xmm6
     lea edx, [edx + 16]
+    sub ecx, 4
     jge l4

   l4b:
@@ -5001,9 +4982,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
     addps xmm2, xmm7 // x, y += dx, dy
     movd esi, xmm0
     movd xmm0, [eax + esi] // copy a pixel
-    sub ecx, 1
     movd [edx], xmm0
     lea edx, [edx + 4]
+    sub ecx, 1
     jge l1
   l1b:
     pop edi
@@ -5059,9 +5040,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
     vpsrlw ymm0, ymm0, 7
     vpsrlw ymm1, ymm1, 7
     vpackuswb ymm0, ymm0, ymm1 // unmutates
-    sub ecx, 32
     vmovdqu [esi + edi], ymm0
     lea esi, [esi + 32]
+    sub ecx, 32
     jg xloop

     jmp xloop99
@@ -5072,9 +5053,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
     vmovdqu ymm1, [esi + edx]
     vpavgb ymm0, ymm0, ymm1
     vpavgb ymm0, ymm0, ymm1
-    sub ecx, 32
     vmovdqu [esi + edi], ymm0
     lea esi, [esi + 32]
+    sub ecx, 32
     jg xloop25

     jmp xloop99
@@ -5083,9 +5064,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
   xloop50:
     vmovdqu ymm0, [esi]
     vpavgb ymm0, ymm0, [esi + edx]
-    sub ecx, 32
     vmovdqu [esi + edi], ymm0
     lea esi, [esi + 32]
+    sub ecx, 32
     jg xloop50

     jmp xloop99
@@ -5096,9 +5077,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
     vmovdqu ymm0, [esi + edx]
     vpavgb ymm0, ymm0, ymm1
     vpavgb ymm0, ymm0, ymm1
-    sub ecx, 32
     vmovdqu [esi + edi], ymm0
     lea esi, [esi + 32]
+    sub ecx, 32
     jg xloop75

     jmp xloop99
@@ -5161,9 +5142,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     psrlw xmm0, 7
     psrlw xmm1, 7
     packuswb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop

     jmp xloop99
@@ -5174,9 +5155,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqu xmm1, [esi + edx]
     pavgb xmm0, xmm1
     pavgb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop25

     jmp xloop99
@@ -5186,9 +5167,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqu xmm0, [esi]
     movdqu xmm1, [esi + edx]
     pavgb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop50

     jmp xloop99
@@ -5199,9 +5180,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movdqu xmm0, [esi + edx]
     pavgb xmm0, xmm1
     pavgb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop75

     jmp xloop99
@@ -5209,9 +5190,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     align 4
   xloop100:
     movdqu xmm0, [esi]
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop100

   xloop99:
@@ -5273,9 +5254,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     paddw xmm0, xmm2 // sum rows
     paddw xmm1, xmm3
     packuswb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop

     jmp xloop99
@@ -5286,9 +5267,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     movdqu xmm1, [esi + edx]
     pavgb xmm0, xmm1
     pavgb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop25

     jmp xloop99
@@ -5298,9 +5279,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     movdqu xmm0, [esi]
     movdqu xmm1, [esi + edx]
     pavgb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop50

     jmp xloop99
@@ -5311,9 +5292,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     movdqu xmm0, [esi + edx]
     pavgb xmm0, xmm1
     pavgb xmm0, xmm1
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop75

     jmp xloop99
@@ -5321,9 +5302,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     align 4
   xloop100:
     movdqu xmm0, [esi]
-    sub ecx, 16
     movdqu [esi + edi], xmm0
     lea esi, [esi + 16]
+    sub ecx, 16
     jg xloop100

   xloop99:
@@ -5352,9 +5333,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
     pshufb xmm0, xmm5
     pshufb xmm1, xmm5
     punpckldq xmm0, xmm1
-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop
     ret
   }
@@ -5383,9 +5364,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
     pand xmm1, xmm5
     packssdw xmm0, xmm1
     packuswb xmm0, xmm1
-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop
     ret
   }
@@ -5409,10 +5390,10 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
     lea eax, [eax + 32]
     pshufb xmm0, xmm5
     pshufb xmm1, xmm5
-    sub ecx, 8
     movdqu [edx], xmm0
     movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 8
     jg wloop
     ret
   }
@@ -5436,10 +5417,10 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     lea eax, [eax + 64]
     vpshufb ymm0, ymm0, ymm5
     vpshufb ymm1, ymm1, ymm5
-    sub ecx, 16
     vmovdqu [edx], ymm0
     vmovdqu [edx + 32], ymm1
     lea edx, [edx + 64]
+    sub ecx, 16
     jg wloop

     vzeroupper
@@ -5502,9 +5483,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     pshufhw xmm1, xmm1, 01Bh
     pshuflw xmm1, xmm1, 01Bh
     packuswb xmm0, xmm1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg shuf_0123

     jmp shuf99
@@ -5520,9 +5501,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     pshufhw xmm1, xmm1, 039h
     pshuflw xmm1, xmm1, 039h
     packuswb xmm0, xmm1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg shuf_0321

     jmp shuf99
@@ -5538,9 +5519,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     pshufhw xmm1, xmm1, 093h
     pshuflw xmm1, xmm1, 093h
     packuswb xmm0, xmm1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg shuf_2103

     jmp shuf99
@@ -5556,9 +5537,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     pshufhw xmm1, xmm1, 0C6h
     pshuflw xmm1, xmm1, 0C6h
     packuswb xmm0, xmm1
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg shuf_3012

   shuf99:
@@ -5700,9 +5681,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
     cvttps2dq xmm4, xmm4
     packuswb xmm0, xmm4
     packuswb xmm0, xmm0
-    sub ecx, 2
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 2
     jg convertloop
     pop esi
     ret
@@ -5740,9 +5721,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
     vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
     vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
     vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
-    sub ecx, 2
     vmovq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 2
     jg convertloop
     vzeroupper
     ret
@@ -5905,9 +5886,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
     movzx edx, byte ptr [eax + 15] // copy alpha.
     mov byte ptr [edi + 15], dl

-    sub ecx, 4
     lea eax, [eax + 16]
     lea edi, [edi + 16]
+    sub ecx, 4
     jg convertloop

     pop edi
diff --git a/source/scale_posix.cc b/source/scale_posix.cc
index e1c442aa2..00168dfb4 100644
--- a/source/scale_posix.cc
+++ b/source/scale_posix.cc
@@ -534,11 +534,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     "paddusw %%xmm0,%%xmm1 \n"
     "pmulhuw %%xmm5,%%xmm1 \n"
     "packuswb %%xmm1,%%xmm1 \n"
-    "sub $0x6,%2 \n"
     "movd %%xmm1," MEMACCESS(1) " \n"
     "psrlq $0x10,%%xmm1 \n"
     "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
     "lea " MEMLEA(0x6,1) ",%1 \n"
+    "sub $0x6,%2 \n"
     "jg 1b \n"
   : "+r"(src_ptr), // %0
     "+r"(dst_ptr), // %1
@@ -602,11 +602,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     "paddusw %%xmm7,%%xmm6 \n"
     "pmulhuw %%xmm4,%%xmm6 \n"
     "packuswb %%xmm6,%%xmm6 \n"
-    "sub $0x6,%2 \n"
     "movd %%xmm6," MEMACCESS(1) " \n"
     "psrlq $0x10,%%xmm6 \n"
     "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
     "lea " MEMLEA(0x6,1) ",%1 \n"
+    "sub $0x6,%2 \n"
     "jg 1b \n"
   : "+r"(src_ptr), // %0
     "+r"(dst_ptr), // %1
@@ -765,10 +765,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     "movdqa %%xmm0,%%xmm1 \n"
     "punpcklbw %%xmm0,%%xmm0 \n"
     "punpckhbw %%xmm1,%%xmm1 \n"
-    "sub $0x20,%2 \n"
     "movdqu %%xmm0," MEMACCESS(0) " \n"
     "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
     "lea " MEMLEA(0x20,0) ",%0 \n"
+    "sub $0x20,%2 \n"
     "jg 1b \n"

   : "+r"(dst_ptr), // %0
@@ -792,9 +792,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
     "lea " MEMLEA(0x20,0) ",%0 \n"
     "shufps $0xdd,%%xmm1,%%xmm0 \n"
-    "sub $0x4,%2 \n"
     "movdqu %%xmm0," MEMACCESS(1) " \n"
     "lea " MEMLEA(0x10,1) ",%1 \n"
+    "sub $0x4,%2 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
@@ -820,9 +820,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
     "shufps $0x88,%%xmm1,%%xmm0 \n"
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
-    "sub $0x4,%2 \n"
     "movdqu %%xmm0," MEMACCESS(1) " \n"
     "lea " MEMLEA(0x10,1) ",%1 \n"
+    "sub $0x4,%2 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
@@ -852,9 +852,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     "shufps $0x88,%%xmm1,%%xmm0 \n"
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
-    "sub $0x4,%2 \n"
     "movdqu %%xmm0," MEMACCESS(1) " \n"
     "lea " MEMLEA(0x10,1) ",%1 \n"
+    "sub $0x4,%2 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
@@ -890,9 +890,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
     "punpckldq %%xmm3,%%xmm2 \n"
     "punpcklqdq %%xmm2,%%xmm0 \n"
-    "sub $0x4,%3 \n"
     "movdqu %%xmm0," MEMACCESS(2) " \n"
     "lea " MEMLEA(0x10,2) ",%2 \n"
+    "sub $0x4,%3 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(src_stepx_x4), // %1
@@ -941,9 +941,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     "shufps $0x88,%%xmm1,%%xmm0 \n"
     "shufps $0xdd,%%xmm1,%%xmm2 \n"
     "pavgb %%xmm2,%%xmm0 \n"
-    "sub $0x4,%3 \n"
     "movdqu %%xmm0," MEMACCESS(2) " \n"
     "lea " MEMLEA(0x10,2) ",%2 \n"
+    "sub $0x4,%3 \n"
     "jg 1b \n"
   : "+r"(src_argb), // %0
     "+r"(src_stepx_x4), // %1
@@ -997,9 +997,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     "pextrw $0x3,%%xmm2,%k1 \n"
     "punpckldq %%xmm4,%%xmm1 \n"
     "punpcklqdq %%xmm1,%%xmm0 \n"
-    "sub $0x4,%4 \n"
     "movdqu %%xmm0," MEMACCESS(2) " \n"
     "lea " MEMLEA(0x10,2) ",%2 \n"
+    "sub $0x4,%4 \n"
     "jge 40b \n"

   "49: \n"
@@ -1046,10 +1046,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
     "movdqa %%xmm0,%%xmm1 \n"
     "punpckldq %%xmm0,%%xmm0 \n"
     "punpckhdq %%xmm1,%%xmm1 \n"
-    "sub $0x8,%2 \n"
     "movdqu %%xmm0," MEMACCESS(0) " \n"
     "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
     "lea " MEMLEA(0x20,0) ",%0 \n"
+    "sub $0x8,%2 \n"
     "jg 1b \n"

   : "+r"(dst_argb), // %0
diff --git a/source/scale_win.cc b/source/scale_win.cc
index 8370ef493..0f87692e1 100644
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@@ -111,9 +111,9 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     psrlw xmm0, 8 // isolate odd pixels.
     psrlw xmm1, 8
     packuswb xmm0, xmm1
-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop

     ret
@@ -149,9 +149,9 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm1, xmm3
     packuswb xmm0, xmm1

-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop

     ret
@@ -192,9 +192,9 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm1, xmm3
     packuswb xmm0, xmm1

-    sub ecx, 16
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 16
     jg wloop

     pop esi
@@ -226,9 +226,9 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     packuswb xmm0, xmm1
     psrlw xmm0, 8
     packuswb xmm0, xmm0
-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop

     ret
@@ -285,9 +285,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
     pavgw xmm0, xmm2
     packuswb xmm0, xmm0

-    sub ecx, 8
     movq qword ptr [edx], xmm0
     lea edx, [edx + 8]
+    sub ecx, 8
     jg wloop

     pop edi
@@ -398,9 +398,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    sub ecx, 24
     movq qword ptr [edx + 16], xmm0
     lea edx, [edx + 24]
+    sub ecx, 24
     jg wloop

     pop esi
@@ -460,9 +460,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    sub ecx, 24
     movq qword ptr [edx + 16], xmm0
     lea edx, [edx+24]
+    sub ecx, 24
     jg wloop

     pop esi
@@ -493,11 +493,11 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
     pshufb xmm1, xmm5
     paddusb xmm0, xmm1

-    sub ecx, 12
     movq qword ptr [edx], xmm0 // write 12 pixels
     movhlps xmm1, xmm0
     movd [edx + 8], xmm1
     lea edx, [edx + 12]
+    sub ecx, 12
     jg xloop

     ret
@@ -558,11 +558,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
     pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
     packuswb xmm6, xmm6

-    sub ecx, 6
     movd [edx], xmm6 // write 6 pixels
     psrlq xmm6, 16
     movd [edx + 2], xmm6
     lea edx, [edx + 6]
+    sub ecx, 6
     jg xloop

     pop esi
@@ -604,11 +604,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
     pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
     packuswb xmm1, xmm1

-    sub ecx, 6
     movd [edx], xmm1 // write 6 pixels
     psrlq xmm1, 16
     movd [edx + 2], xmm1
     lea edx, [edx + 6]
+    sub ecx, 6
     jg xloop

     pop esi
@@ -784,10 +784,10 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     movdqa xmm1, xmm0
     punpcklbw xmm0, xmm0
     punpckhbw xmm1, xmm1
-    sub ecx, 32
     movdqu [edx], xmm0
     movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 32
     jg wloop

     ret
@@ -812,9 +812,9 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     movdqu xmm1, [eax + 16]
     lea eax, [eax + 32]
     shufps xmm0, xmm1, 0xdd
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop

     ret
@@ -842,9 +842,9 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop

     ret
@@ -877,9 +877,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop

     pop esi
@@ -914,9 +914,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
     lea eax, [eax + ebx * 4]
     punpckldq xmm2, xmm3
     punpcklqdq xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop

     pop edi
@@ -963,9 +963,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
     shufps xmm0, xmm1, 0x88 // even pixels
     shufps xmm2, xmm1, 0xdd // odd pixels
     pavgb xmm0, xmm2
-    sub ecx, 4
     movdqu [edx], xmm0
     lea edx, [edx + 16]
+    sub ecx, 4
     jg wloop

     pop edi
@@ -1021,9 +1021,9 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
     pextrw edx, xmm2, 3 // get x1 integer. next iteration.
     punpckldq xmm1, xmm4 // x2 x3
     punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
-    sub ecx, 4 // 4 pixels
     movdqu [edi], xmm0
     lea edi, [edi + 16]
+    sub ecx, 4 // 4 pixels
     jge xloop4

     align 4
@@ -1160,10 +1160,10 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
     movdqa xmm1, xmm0
     punpckldq xmm0, xmm0
     punpckhdq xmm1, xmm1
-    sub ecx, 8
     movdqu [edx], xmm0
     movdqu [edx + 16], xmm1
     lea edx, [edx + 32]
+    sub ecx, 8
     jg wloop

     ret
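
Note (illustration only, not part of the patch): every hunk above applies the
same transformation, so a minimal standalone sketch of the resulting loop
shape may help. The names CopyLoopSketch/src/dst/count below are hypothetical,
and count is assumed to be positive and a multiple of 16. With the sub placed
immediately before jg, one instruction both decrements the counter and sets
the flags the branch tests, so no separate cmp is needed and the store/advance
instructions no longer sit between the flag setter and the branch; on recent
x86 cores the adjacent sub/jg pair can also macro-fuse.

#include <stdint.h>

// Copies |count| bytes, 16 at a time, using the sub-before-branch loop
// shape this patch applies throughout (GCC/Clang inline asm sketch).
static void CopyLoopSketch(const uint8_t* src, uint8_t* dst, int count) {
  asm volatile (
    "1:                                        \n"
    "movdqu    (%0),%%xmm0                     \n"  // read 16 bytes
    "lea       0x10(%0),%0                     \n"
    "movdqu    %%xmm0,(%1)                     \n"  // write 16 bytes
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%2                        \n"  // count -= 16; sets flags
    "jg        1b                              \n"  // jg consumes those flags
    : "+r"(src), "+r"(dst), "+r"(count)             // all operands read-write
    :
    : "memory", "cc", "xmm0"
  );
}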