Make row functions used for planarfunctions and convert use movdqu to relax alignment constraint. Step 1 - make functions unaligned.

BUG=365
TESTED=libyuv_unittest passes
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/26709004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1111 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2014-10-03 21:11:37 +00:00
parent 147bbede9d
commit b720049a54
5 changed files with 455 additions and 512 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1109
Version: 1111
License: BSD
License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1109
#define LIBYUV_VERSION 1111
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -47,9 +47,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_SSE2;
}
#endif
@ -101,9 +99,7 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
}
#endif
@ -254,9 +250,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
#endif
@ -307,14 +301,8 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
@ -385,14 +373,8 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
UYVYToUV422Row = UYVYToUV422Row_SSE2;
if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
UYVYToYRow = UYVYToYRow_SSE2;
}
}
UYVYToUV422Row = UYVYToUV422Row_SSE2;
UYVYToYRow = UYVYToYRow_SSE2;
}
}
#endif
@ -504,9 +486,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
}
#if defined(HAS_ARGBMIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
ARGBMirrorRow = ARGBMirrorRow_SSSE3;
}
#endif
@ -824,10 +804,7 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
I422ToBGRARow = I422ToBGRARow_SSSE3;
}
I422ToBGRARow = I422ToBGRARow_SSSE3;
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
@ -894,10 +871,7 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
I422ToABGRRow = I422ToABGRRow_SSSE3;
}
I422ToABGRRow = I422ToABGRRow_SSSE3;
}
}
#endif
@ -956,10 +930,7 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
I422ToRGBARow = I422ToRGBARow_SSSE3;
}
I422ToRGBARow = I422ToRGBARow_SSSE3;
}
}
#endif
@ -1084,9 +1055,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0;
}
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) &&
IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON;
}
#endif
@ -1150,8 +1119,7 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
return 0;
}
@ -1202,9 +1170,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBATTENUATEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
@ -1317,9 +1283,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
@ -1355,8 +1319,7 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
#elif defined(HAS_ARGBGRAYROW_NEON)
@ -1388,8 +1351,7 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBSEPIAROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_SSSE3;
}
#elif defined(HAS_ARGBSEPIAROW_NEON)
@ -1430,8 +1392,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
}
#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
@ -1573,8 +1534,7 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBQUANTIZEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
}
#elif defined(HAS_ARGBQUANTIZEROW_NEON)
@ -1748,9 +1708,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHADEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
#elif defined(HAS_ARGBSHADEROW_NEON)
@ -1882,11 +1840,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
ARGBShuffleRow = ARGBShuffleRow_SSSE3;
}
ARGBShuffleRow = ARGBShuffleRow_SSSE3;
}
}
#endif
@ -1942,8 +1896,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
// ARGBToBayer used to select G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
@ -1951,8 +1904,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
@ -2043,8 +1995,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
#endif
@ -2065,8 +2016,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
#endif
@ -2088,8 +2038,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
#endif
@ -2213,10 +2162,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
@ -2259,10 +2205,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif

View File

@ -221,7 +221,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@ -252,8 +252,8 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@ -318,17 +318,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
@ -359,17 +359,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
"movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
@ -418,8 +418,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@ -475,8 +475,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@ -519,8 +519,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@ -631,7 +631,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
@ -672,7 +672,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
@ -712,7 +712,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
@ -744,10 +744,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@ -760,7 +760,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@ -820,10 +820,10 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@ -837,7 +837,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@ -912,10 +912,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@ -979,10 +979,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@ -1187,10 +1187,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@ -1202,11 +1202,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
@ -1219,7 +1219,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"paddb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
BUNDLEALIGN
MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@ -1317,10 +1317,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@ -1430,10 +1430,10 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@ -1446,7 +1446,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
@ -1513,10 +1513,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@ -1640,10 +1640,10 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@ -1656,7 +1656,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
@ -1714,10 +1714,10 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@ -1730,7 +1730,7 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
@ -1797,10 +1797,10 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@ -1933,10 +1933,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
BUNDLEALIGN
MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
@ -2199,8 +2199,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2354,8 +2354,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2393,8 +2393,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2430,8 +2430,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2464,8 +2464,8 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
"lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2686,8 +2686,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
"movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
"movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
"lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2725,8 +2725,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
"movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
"movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
"lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2765,8 +2765,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
"movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
"movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
"movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
"movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
"lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
@ -2939,8 +2939,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
@ -2970,7 +2970,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
"lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
@ -3104,8 +3104,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
@ -3115,8 +3115,8 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
"movdqu %%xmm0," MEMACCESS(1) " \n"
MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
@ -3183,7 +3183,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
@ -3405,16 +3405,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"punpcklbw %%xmm2,%%xmm2 \n"
"punpckhwd %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
"movdqa " MEMACCESS(1) ",%%xmm4 \n"
"movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"movdqu " MEMACCESS(1) ",%%xmm4 \n"
"movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
"movdqa %%xmm2," MEMACCESS(1) " \n"
"movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm2," MEMACCESS(1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@ -3498,13 +3498,13 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
@ -3527,11 +3527,11 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@ -3572,8 +3572,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
@ -3722,14 +3722,14 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
@ -3751,11 +3751,11 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
BUNDLEALIGN
MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@ -3796,8 +3796,8 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
@ -4014,7 +4014,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
@ -4132,16 +4132,16 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 4 pixel loop.
LABELALIGN
"40: \n"
"movdqa " MEMACCESS(0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
"movdqa " MEMACCESS(1) ",%%xmm2 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
"movdqa " MEMACCESS(1) ",%%xmm1 \n"
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
@ -4151,7 +4151,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 40b \n"
"jmp 49f \n"
@ -4178,7 +4178,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
@ -4237,17 +4237,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// 4 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"movdqa " MEMACCESS(0) ",%%xmm2 \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm2 \n"
@ -4256,7 +4256,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@ -4389,16 +4389,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x18,%%xmm2 \n"
"psrld $0x18,%%xmm3 \n"
@ -4411,8 +4411,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@ -4455,30 +4455,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm6 \n"
"phaddw %%xmm6,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm5 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm5 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm5 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm5 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm5 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm5 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"psrld $0x18,%%xmm6 \n"
"psrld $0x18,%%xmm1 \n"
"packuswb %%xmm1,%%xmm6 \n"
@ -4488,8 +4488,8 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n"
"sub $0x8,%1 \n"
"movdqa %%xmm0," MEMACCESS(0) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
@ -4520,12 +4520,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm7 \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddsw %%xmm7,%%xmm0 \n"
@ -4535,13 +4535,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm6,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm1 \n"
"movdqa " MEMACCESS(0) ",%%xmm6 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm6 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm6 \n"
@ -4554,8 +4554,8 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"punpcklwd %%xmm1,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm6 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
@ -4593,14 +4593,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
// 4 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n"
"movdqa " MEMACCESS(0) ",%%xmm7 \n"
"movdqu " MEMACCESS(0) ",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n"
@ -4608,7 +4608,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
"sub $0x4,%1 \n"
"movdqa %%xmm0," MEMACCESS(0) " \n"
"movdqu %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
@ -4637,7 +4637,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// 4 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@ -4648,7 +4648,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@ -4901,8 +4901,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
@ -4919,10 +4919,10 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm1," MEMACCESS(2) " \n"
"movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
"movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
"movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
"movdqu %%xmm1," MEMACCESS(2) " \n"
"movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
@ -4953,12 +4953,12 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm0," MEMACCESS(2) " \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
@ -4992,8 +4992,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"paddusb %%xmm1,%%xmm2 \n"
@ -5010,10 +5010,10 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"sub $0x10,%3 \n"
"movdqa %%xmm6," MEMACCESS(2) " \n"
"movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
"movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
"movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
"movdqu %%xmm6," MEMACCESS(2) " \n"
"movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
"movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
"movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
@ -5060,22 +5060,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n"
"movdqa " MEMACCESS(2) ",%%xmm2 \n"
"movdqu " MEMACCESS(2) ",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n"
"movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n"
"movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
"movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"paddd %%xmm0,%%xmm5 \n"
"movdqa %%xmm2," MEMACCESS(1) " \n"
"movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
"movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
"movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
"movdqu %%xmm2," MEMACCESS(1) " \n"
"movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
"movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
@ -5140,10 +5140,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 4 pixel small loop \n"
LABELALIGN
"4: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
BUNDLEALIGN
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
@ -5174,10 +5174,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 4 pixel loop \n"
LABELALIGN
"40: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
"movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
BUNDLEALIGN
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
@ -5221,7 +5221,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 1 pixel loop \n"
LABELALIGN
"10: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
"lea " MEMLEA(0x10,0) ",%0 \n"
"psubd " MEMACCESS(1) ",%%xmm0 \n"
@ -5822,8 +5822,8 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
"pshufd $0x0,%%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
@ -5852,8 +5852,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
@ -5882,17 +5882,17 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
"movdqa " MEMACCESS(3) ",%%xmm5 \n"
"movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
"movdqa %%xmm0," MEMACCESS(1) " \n"
"movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
@ -5909,7 +5909,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
"movdqa " MEMACCESS(3) ",%%xmm5 \n"
"movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"

File diff suppressed because it is too large Load Diff