diff --git a/README.chromium b/README.chromium
index 52f29a6c3..0fe897610 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 254
+Version: 255
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index b6797c893..facc89116 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 254
+#define LIBYUV_VERSION 255

 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare.cc b/source/compare.cc
index c82b3918f..68a42ba82 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -163,6 +163,7 @@ static uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
     "movd %2,%%xmm0 \n"
     "pxor %%xmm7,%%xmm7 \n"
     "movdqa %4,%%xmm6 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm1 \n"
     "lea 0x10(%0),%0 \n"
@@ -331,7 +332,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
     "pxor %%xmm0,%%xmm0 \n"
     "pxor %%xmm5,%%xmm5 \n"
     "sub %0,%1 \n"
-
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm1 \n"
     "movdqa (%0,%1,1),%%xmm2 \n"
diff --git a/source/convert.cc b/source/convert.cc
index 0b1f03c74..1cfb4c4ac 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -74,6 +74,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
     mov ecx, [esp + 4 + 16]  // pix
     sub edi, eax
+    align 16
   convertloop:
     movdqa xmm0, [eax]
     pavgb xmm0, [eax + edx]
@@ -92,6 +93,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                          uint8* dst_uv, int pix) {
   asm volatile (
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pavgb (%0,%3),%%xmm0 \n"
@@ -467,6 +469,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
     pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw xmm5, 8
+    align 16
   convertloop:
     movdqa xmm0, [eax]
     movdqa xmm1, [eax + 16]
@@ -506,6 +509,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
diff --git a/source/convert_from.cc b/source/convert_from.cc
index efe58dd82..55ff8f5c4 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -291,6 +291,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
                                uint8* dst_frame, int width) {
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
  "1: \n"
    "movq (%1),%%xmm2 \n"
    "movq (%1,%2,1),%%xmm3 \n"
@@ -326,6 +327,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
                                uint8* dst_frame, int width) {
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
  "1: \n"
    "movq (%1),%%xmm2 \n"
    "movq (%1,%2,1),%%xmm3 \n"
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 1cdf709e4..dea1491bd 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -57,6 +57,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
   asm volatile (
     "movd %3,%%xmm5 \n"
     "pshufd $0x0,%%xmm5,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "lea 0x10(%0),%0 \n"
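Reviewer note on the recurring change in this roll: `.p2align 4` asks GAS to pad with no-ops until the next instruction starts on a 2^4 = 16 byte boundary, so the hot loop label that follows begins at a cache-line- and decoder-friendly address; `align 16` is the MASM equivalent used in the win32 `__asm` blocks. A minimal self-contained sketch of the pattern, assuming x86-64 with GCC/Clang extended asm (the function itself is illustrative, not part of libyuv):

  #include <stdint.h>

  // Align the head of a hot copy loop to 16 bytes, as the hunks above do.
  void CopyRow_sketch(const uint8_t* src, uint8_t* dst, int count) {
    asm volatile (
      ".p2align 4 \n"           // pad with nops to a 2^4 = 16 byte boundary
    "1: \n"
      "movdqu (%0),%%xmm0 \n"   // 16 bytes per iteration
      "lea 0x10(%0),%0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"                // count must be a positive multiple of 16
      : "+r"(src), "+r"(dst), "+r"(count)
      :
      : "memory", "cc", "xmm0"
    );
  }

The padding is paid once at loop entry, not per iteration, which is why it is worth sprinkling over every row loop in the library.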
+ ".p2align 4 \n" "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" @@ -499,6 +500,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3),%%xmm1 \n" @@ -639,6 +641,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%4),%%xmm1 \n" diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 70dd4201e..af790aeee 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -32,6 +32,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "sub %4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane + ".p2align 4 \n" "1: \n" "mov r9, %0 \n" @@ -198,6 +199,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "sub %6, #8 \n" // handle 8x8 blocks. this should be the majority of the plane + ".p2align 4 \n" "1: \n" "mov r9, %0 \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index 1a8f4fb8f..122b30933 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -112,6 +112,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" + ".p2align 4 \n" "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" @@ -141,6 +142,7 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { asm volatile ( "movdqa %3,%%xmm5 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -164,6 +166,7 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { asm volatile ( "movdqa %3,%%xmm5 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -187,6 +190,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -227,6 +231,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -279,6 +284,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -327,6 +333,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -372,6 +379,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0x4,%%xmm5 \n" "sub %0,%1 \n" "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" @@ -405,6 +413,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { asm volatile ( "movdqa %3,%%xmm6 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -445,6 +454,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, 
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 1a8f4fb8f..122b30933 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -112,6 +112,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "pslld $0x18,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movq (%0),%%xmm0 \n"
     "lea 0x8(%0),%0 \n"
@@ -141,6 +142,7 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
   asm volatile (
     "movdqa %3,%%xmm5 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pshufb %%xmm5,%%xmm0 \n"
@@ -164,6 +166,7 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
   asm volatile (
     "movdqa %3,%%xmm5 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pshufb %%xmm5,%%xmm0 \n"
@@ -187,6 +190,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
     "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
     "pslld $0x18,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -227,6 +231,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
     "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
     "pslld $0x18,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -279,6 +284,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "psllw $0x8,%%xmm7 \n"
     "sub %0,%1 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -327,6 +333,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "psllw $0x8,%%xmm7 \n"
     "sub %0,%1 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -372,6 +379,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
     "pslld $0x4,%%xmm5 \n"
     "sub %0,%1 \n"
     "sub %0,%1 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm2 \n"
@@ -405,6 +413,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
   asm volatile (
     "movdqa %3,%%xmm6 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -445,6 +454,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst,
                         int pix) {
   asm volatile (
     "movdqa %3,%%xmm6 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -491,6 +501,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "pslld $0x5,%%xmm4 \n"
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "pslld $0xb,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -531,6 +542,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "pslld $0xa,%%xmm6 \n"
     "pcmpeqb %%xmm7,%%xmm7 \n"
     "pslld $0xf,%%xmm7 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -570,6 +582,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
     "psllw $0xc,%%xmm4 \n"
     "movdqa %%xmm4,%%xmm3 \n"
     "psrlw $0x8,%%xmm3 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
@@ -599,6 +612,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -635,6 +649,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -689,6 +704,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -753,6 +769,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -808,6 +825,7 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -844,6 +862,7 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -893,6 +912,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -957,6 +977,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1012,6 +1033,7 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1048,6 +1070,7 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
   asm volatile (
     "movdqa %4,%%xmm5 \n"
     "movdqa %3,%%xmm4 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1097,6 +1120,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1161,6 +1185,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
   );
   asm volatile (
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
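Note the pattern running through row_posix.cc: almost every row function comes in a pair where the plain version loads with movdqa (and requires 16-byte-aligned rows) while the _Unaligned twin differs only in using movdqu. In intrinsics terms the distinction is just this (a sketch, assuming SSE2; the helper is illustrative):

  #include <emmintrin.h>  // SSE2
  #include <stdint.h>

  // movdqa (_mm_load_si128) faults unless p is 16-byte aligned;
  // movdqu (_mm_loadu_si128) accepts any address, historically at some cost.
  static inline __m128i LoadRow16(const uint8_t* p, bool aligned) {
    return aligned ? _mm_load_si128(reinterpret_cast<const __m128i*>(p))
                   : _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
  }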
"pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" @@ -1329,6 +1355,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "pcmpeqb %%xmm5,%%xmm5 \n" @@ -1364,6 +1391,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm2 \n" @@ -1398,6 +1426,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" @@ -1432,6 +1461,7 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "pcmpeqb %%xmm5,%%xmm5 \n" @@ -1467,6 +1497,7 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" YUVTORGB "punpcklbw %%xmm1,%%xmm2 \n" @@ -1501,6 +1532,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "sub %1,%2 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movd (%1),%%xmm0 \n" "movd (%1,%2,1),%%xmm1 \n" @@ -1562,6 +1594,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, "mov $0x012a012a,%%eax \n" "movd %%eax,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" + ".p2align 4 \n" "1: \n" // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 "movq (%0),%%xmm0 \n" @@ -1607,6 +1640,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { asm volatile ( "movdqa %3,%%xmm5 \n" "lea -0x10(%0),%0 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0,%2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -1631,6 +1665,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast(width); asm volatile ( "lea -0x10(%0),%0 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0,%2),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1668,6 +1703,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "movdqa %4,%%xmm1 \n" "lea -16(%0,%3,2),%0 \n" "sub %1,%2 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "lea -16(%0),%0 \n" @@ -1695,6 +1731,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, void AddRow_SSE2(const uint8* src, uint16* dst, int width) { asm volatile ( "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm2 \n" "lea 0x10(%0),%0 \n" @@ -1725,6 +1762,7 @@ void AddRow_SSE2(const uint8* src, uint16* dst, int width) { void SubRow_SSE2(const uint8* src, uint16* dst, int width) { asm volatile ( "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm2 \n" "lea 0x10(%0),%0 \n" @@ -1758,6 +1796,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -1833,6 +1872,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -1861,6 +1901,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ 
@@ -1900,6 +1941,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1930,6 +1972,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1965,6 +2008,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
   asm volatile (
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1993,6 +2037,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -2029,6 +2074,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
   asm volatile (
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -2057,6 +2103,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
     "sub %1,%2 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -2109,6 +2156,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 8 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm3 \n"
     "movdqa %%xmm3,%%xmm0 \n"
@@ -2184,6 +2232,7 @@ void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 1 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movd (%0),%%xmm3 \n"
     "lea 0x4(%0),%0 \n"
@@ -2241,6 +2290,7 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 8 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm3 \n"
     "movdqa %%xmm3,%%xmm0 \n"
@@ -2313,6 +2363,7 @@ void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     "pslld $0x18,%%xmm4 \n"

     // 1 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movd (%0),%%xmm3 \n"
     "lea 0x4(%0),%0 \n"
@@ -2361,6 +2412,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     "psrld $0x8,%%xmm5 \n"

     // 4 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "punpcklbw %%xmm0,%%xmm0 \n"
@@ -2415,6 +2467,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
     "movdqa %4,%%xmm5 \n"

     // 4 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "pshufb %%xmm4,%%xmm0 \n"
@@ -2503,6 +2556,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
     "pslld $0x18,%%xmm4 \n"

     // 4 pixel loop
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movzb 0x3(%0),%3 \n"
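For context on the attenuate kernels just touched: ARGBAttenuateRow premultiplies each pixel's B, G and R channels by its alpha. A scalar sketch of the operation (illustrative only; the exact rounding in libyuv's SIMD paths may differ slightly from this reference):

  #include <stdint.h>

  // Per-pixel alpha premultiply, the operation the ARGBAttenuateRow
  // kernels above vectorize four pixels at a time.
  void ARGBAttenuateRow_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width) {
    for (int i = 0; i < width; ++i) {
      uint32_t a = src_argb[3];
      dst_argb[0] = static_cast<uint8_t>((src_argb[0] * a) >> 8);  // B
      dst_argb[1] = static_cast<uint8_t>((src_argb[1] * a) >> 8);  // G
      dst_argb[2] = static_cast<uint8_t>((src_argb[2] * a) >> 8);  // R
      dst_argb[3] = static_cast<uint8_t>(a);                       // A kept
      src_argb += 4;
      dst_argb += 4;
    }
  }

ARGBUnattenuateRow, whose loop is also aligned above, is the inverse (divide by alpha), which is why its inner loop loads the alpha byte with movzb before scaling.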
diff --git a/source/scale.cc b/source/scale.cc
index 13ca2288f..044d2ebe8 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1457,8 +1457,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1471,8 +1473,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop1

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1486,8 +1490,10 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop2

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1538,8 +1544,11 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0  // duplicate last pixel to allow horizontal filtering
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
+
     pop edi
     pop esi
     ret
@@ -1552,8 +1561,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop1

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1567,8 +1578,10 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     lea esi, [esi + 16]
     jg xloop2

-    mov al, [esi + edi - 1]
-    mov [esi + edi], al
+    punpckhbw xmm0, xmm0
+    pshufhw xmm0, xmm0, 0xff
+    punpckhqdq xmm0, xmm0
+    movdqa [esi + edi], xmm0
     pop edi
     pop esi
     ret
@@ -1634,6 +1647,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1658,6 +1672,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1692,6 +1707,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1717,6 +1733,7 @@ static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqu (%0),%%xmm0 \n"
     "movdqu 0x10(%0),%%xmm1 \n"
@@ -1752,6 +1769,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrld $0x18,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1779,6 +1797,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     "pcmpeqb %%xmm7,%%xmm7 \n"
     "psrlw $0x8,%%xmm7 \n"
     "lea (%4,%4,2),%3 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1831,6 +1850,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlq $0x38,%%xmm5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "movdqa 0x10(%0),%%xmm1 \n"
@@ -1860,6 +1880,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
     "pxor %%xmm4,%%xmm4 \n"
     "sub $0x1,%5 \n"
+    ".p2align 4 \n"
   "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "mov %0,%3 \n"
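ScaleRowDown2 keeps every second pixel; the pcmpeqb/psrlw mask plus packuswb in the loops above implements that 2:1 decimation 16 output bytes at a time. An intrinsics sketch of the same inner loop, assuming dst_width is a multiple of 16 (illustrative only; libyuv handles remainders elsewhere):

  #include <emmintrin.h>
  #include <stdint.h>

  // Keep every other byte of 32 input bytes, producing 16 output bytes.
  static void ScaleRowDown2_sketch(const uint8_t* src_ptr, uint8_t* dst,
                                   int dst_width) {
    const __m128i mask = _mm_set1_epi16(0x00ff);  // pcmpeqb + psrlw $8
    for (int x = 0; x < dst_width; x += 16) {
      __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr));
      __m128i b =
          _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_ptr + 16));
      a = _mm_and_si128(a, mask);  // keep even-indexed source pixels
      b = _mm_and_si128(b, mask);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(dst),
                       _mm_packus_epi16(a, b));
      src_ptr += 32;
      dst += 16;
    }
  }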
"movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2297,8 +2320,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 2b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2311,8 +2336,10 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 3b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2361,8 +2388,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 1b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2374,8 +2403,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 2b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2388,8 +2419,10 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "lea 0x10(%esi),%esi \n" "jg 3b \n" - "mov -0x1(%esi,%edi,1),%al \n" - "mov %al,(%esi,%edi,1) \n" + "punpckhbw %xmm0,%xmm0 \n" + "pshufhw $0xff,%xmm0,%xmm0 \n" + "punpckhqdq %xmm0,%xmm0 \n" + "movdqa %xmm0,(%esi,%edi,1) \n" "pop %edi \n" "pop %esi \n" "ret \n" @@ -2401,6 +2434,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, asm volatile ( "lea (%3,%3,2),%%r10 \n" "pxor %%xmm7,%%xmm7 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -2461,6 +2495,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%3),%%xmm3 \n" "movdqa (%4),%%xmm4 \n" "movdqa (%5),%%xmm5 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm2 \n" @@ -2496,6 +2531,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%8),%%xmm6 \n" // _madd11 "movdqa (%9),%%xmm7 \n" // _round34 "movdqa (%10),%%xmm8 \n" // _madd21 + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3),%%xmm1 \n" @@ -2553,6 +2589,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%8),%%xmm6 \n" // _madd11 "movdqa (%9),%%xmm7 \n" // _round34 "movdqa (%10),%%xmm8 \n" // _madd21 + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3,1),%%xmm1 \n" @@ -2609,6 +2646,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, asm volatile ( "movdqa (%3),%%xmm4 \n" "movdqa (%4),%%xmm5 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -2638,6 +2676,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%5),%%xmm5 \n" "movdqa (%6),%%xmm6 \n" "pxor %%xmm7,%%xmm7 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3,1),%%xmm2 \n" @@ -2695,6 +2734,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, "movdqa (%5),%%xmm5 \n" "movdqa (%6),%%xmm6 \n" "movdqa (%7),%%xmm7 \n" + ".p2align 4 \n" "1:" "movdqa (%0),%%xmm2 \n" "pavgb (%0,%3,1),%%xmm2 \n" @@ -2733,6 +2773,7 @@ static void 
@@ -2733,6 +2773,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                  int dst_width, int source_y_fraction) {
   if (source_y_fraction == 0) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "lea 0x10(%1),%1 \n"
@@ -2751,6 +2792,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
     return;
   } else if (source_y_fraction == 128) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%3,1),%%xmm2 \n"
@@ -2781,6 +2823,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
       "punpcklwd %%xmm5,%%xmm5 \n"
       "pshufd $0x0,%%xmm5,%%xmm5 \n"
       "pxor %%xmm7,%%xmm7 \n"
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%4,1),%%xmm2 \n"
@@ -2824,6 +2867,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                   int dst_width, int source_y_fraction) {
   if (source_y_fraction <= 1) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "lea 0x10(%1),%1 \n"
@@ -2842,6 +2886,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
     return;
   } else if (source_y_fraction == 128) {
     asm volatile (
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%3,1),%%xmm2 \n"
@@ -2870,6 +2915,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
       "movd %%eax,%%xmm5 \n"
       "punpcklwd %%xmm5,%%xmm5 \n"
       "pshufd $0x0,%%xmm5,%%xmm5 \n"
+      ".p2align 4 \n"
     "1:"
       "movdqa (%1),%%xmm0 \n"
       "movdqa (%1,%4,1),%%xmm2 \n"
@@ -2904,12 +2950,13 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
 // CPU agnostic row functions
 static void ScaleRowDown2_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = src_ptr[0];
     dst[1] = src_ptr[2];
     dst += 2;
     src_ptr += 4;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = src_ptr[0];
   }
@@ -2917,28 +2964,30 @@ static void ScaleRowDown2_C(const uint8* src_ptr, int,
 void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
                         uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = (src_ptr[0] + src_ptr[1] +
-              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
-    dst[1] = (src_ptr[2] + src_ptr[3] +
-              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + 2) >> 2;
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  uint8* dend = dst + dst_width - 1;
+  do {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
     dst += 2;
-    src_ptr += 4;
-  }
+    s += 4;
+    t += 4;
+  } while (dst < dend);
   if (dst_width & 1) {
-    dst[0] = (src_ptr[0] + src_ptr[1] +
-              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
   }
 }

 static void ScaleRowDown4_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = src_ptr[0];
     dst[1] = src_ptr[4];
     dst += 2;
     src_ptr += 8;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = src_ptr[0];
   }
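The C scaler rewrites in these hunks swap an indexed for loop for a do/while that marches dst toward a precomputed end pointer, dropping the separate induction variable x; note the do/while relies on callers guaranteeing dst_width >= 2, since the body runs at least once. The idiom in isolation (this mirrors ScaleRowDown2_C above and is only an illustration):

  #include <stdint.h>

  // March a pointer to a precomputed end instead of counting an index.
  // Assumes dst_width >= 2, as the do/while always executes one pass.
  static void HalveRow_sketch(const uint8_t* src, uint8_t* dst,
                              int dst_width) {
    uint8_t* dend = dst + dst_width - 1;  // bound for the pairwise loop
    do {
      dst[0] = src[0];
      dst[1] = src[2];
      dst += 2;
      src += 4;
    } while (dst < dend);
    if (dst_width & 1) {                  // odd trailing pixel
      dst[0] = src[0];
    }
  }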
@@ -2946,34 +2995,36 @@ static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
                                uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  intptr_t stride = src_stride;
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
-              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
-              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
-              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
-              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
+              src_ptr[stride + 0] + src_ptr[stride + 1] +
+              src_ptr[stride + 2] + src_ptr[stride + 3] +
+              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
               8) >> 4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-              src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
-              src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
-              src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
-              src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
-              src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
-              src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] +
+              src_ptr[stride + 4] + src_ptr[stride + 5] +
+              src_ptr[stride + 6] + src_ptr[stride + 7] +
+              src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+              src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+              src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+              src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
               8) >> 4;
     dst += 2;
     src_ptr += 8;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
-              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
-              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
-              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
-              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
+              src_ptr[stride + 0] + src_ptr[stride + 1] +
+              src_ptr[stride + 2] + src_ptr[stride + 3] +
+              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
               8) >> 4;
   }
 }
@@ -2985,12 +3036,13 @@ static const int kMaxRow12 = kMaxOutputWidth * 2;

 static void ScaleRowDown8_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
-  for (int x = 0; x < dst_width - 1; x += 2) {
+  uint8* dend = dst + dst_width - 1;
+  do {
     dst[0] = src_ptr[0];
     dst[1] = src_ptr[8];
     dst += 2;
     src_ptr += 16;
-  }
+  } while (dst < dend);
   if (dst_width & 1) {
     dst[0] = src_ptr[0];
   }
@@ -3026,9 +3078,9 @@ static void ScaleRowDown34_C(const uint8* src_ptr, int,
 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* d, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
-  uint8* dend = d + dst_width;
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
+  uint8* dend = d + dst_width;
   do {
     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -3049,9 +3101,9 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* d, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
-  uint8* dend = d + dst_width;
   const uint8* s = src_ptr;
   const uint8* t = src_ptr + src_stride;
+  uint8* dend = d + dst_width;
   do {
     uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
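Hoisting `int src_stride` into a local `intptr_t stride`, as ScaleRowDown4Int_C now does, is presumably about 64-bit code generation: the sign extension happens once, and every `src_ptr[stride * 2 + 1]`-style subscript is then pure pointer-width arithmetic rather than a sign-extend at each use. A minimal sketch of the idea (hypothetical helper, not libyuv code):

  #include <stdint.h>

  // One sign extension up front; indexing below stays pointer-width.
  static uint8_t SampleAt_sketch(const uint8_t* src_ptr, int src_stride,
                                 int row, int col) {
    intptr_t stride = src_stride;        // hoisted widening conversion
    return src_ptr[stride * row + col];
  }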
@@ -3073,8 +3125,8 @@ static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
                                 int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
-  uint8* dend = dst_ptr + dst_width;
   const uint8* s = src_ptr;
+  uint8* dend = dst_ptr + dst_width;
   do {
     dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
     dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
@@ -3150,20 +3202,21 @@ static void ScaleRowDown38_C(const uint8* src_ptr, int,
 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* dst_ptr, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
+  intptr_t stride = src_stride;
   for (int i = 0; i < dst_width; i += 3) {
     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-                  src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-                  src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
-                  src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
+                  src_ptr[stride + 0] + src_ptr[stride + 1] +
+                  src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+                  src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
         (65536 / 9) >> 16;
     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-                  src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
-                  src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
-                  src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
+                  src_ptr[stride + 3] + src_ptr[stride + 4] +
+                  src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+                  src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
         (65536 / 9) >> 16;
     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-                  src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
-                  src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
+                  src_ptr[stride + 6] + src_ptr[stride + 7] +
+                  src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
         (65536 / 6) >> 16;
     src_ptr += 8;
     dst_ptr += 3;
@@ -3174,15 +3227,16 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
                                    uint8* dst_ptr, int dst_width) {
   assert((dst_width % 3 == 0) && (dst_width > 0));
+  intptr_t stride = src_stride;
   for (int i = 0; i < dst_width; i += 3) {
     dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-                  src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-                  src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
+                  src_ptr[stride + 0] + src_ptr[stride + 1] +
+                  src_ptr[stride + 2]) * (65536 / 6) >> 16;
     dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-                  src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
-                  src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
+                  src_ptr[stride + 3] + src_ptr[stride + 4] +
+                  src_ptr[stride + 5]) * (65536 / 6) >> 16;
     dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-                  src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
+                  src_ptr[stride + 6] + src_ptr[stride + 7]) *
         (65536 / 4) >> 16;
     src_ptr += 8;
     dst_ptr += 3;
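A closing note on the `* (65536 / 9) >> 16` pattern in the 3/8 scalers: it divides by a constant via a 16.16 fixed-point reciprocal. 65536 / 9 is 7281 at compile time, and for the sums these kernels can produce (at most 9 * 255) the multiply-and-shift lands on floor(sum / 9) or at worst one below it, which is acceptable for 8-bit output. A sketch:

  #include <stdint.h>

  // Fixed-point divide-by-9 as used in ScaleRowDown38_3_Int_C above.
  // 65536 / 9 == 7281; e.g. sum == 1800 yields 199 rather than the exact
  // 200, a worst-case error of one for sums up to 9 * 255.
  static uint8_t DivBy9_sketch(uint32_t sum) {
    return static_cast<uint8_t>((sum * (65536 / 9)) >> 16);
  }

The same trick with 65536 / 6 and 65536 / 4 handles the partial 6- and 4-sample sums at the right edge of each group.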