diff --git a/README.chromium b/README.chromium
index 980a40c78..a78bd81f9 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 745
+Version: 746
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 92c828846..ad6c50761 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -38,8 +38,17 @@ extern "C" {
 // The following are available on all x86 platforms, including NaCL:
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBBLENDROW_SSSE3
+// Effects:
+#define HAS_ARGBADDROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+
+// Conversions:
+#define HAS_FIXEDDIV_X86
+
 #endif

 // The following are available on all x86 platforms except NaCL x64:
@@ -47,7 +56,7 @@ extern "C" {
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !(defined(__native_client__) && defined(__x86_64__))

-// Conversions.
+// Conversions:
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
@@ -110,19 +119,14 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
-#define HAS_FIXEDDIV

-// Effects
-#define HAS_ARGBADDROW_SSE2
+// Effects:
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBMIRRORROW_SSSE3
-#define HAS_ARGBMULTIPLYROW_SSE2
 #define HAS_ARGBQUANTIZEROW_SSE2
 #define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -134,12 +138,12 @@ extern "C" {
 #define HAS_SOBELYROW_SSSE3
 #endif

-// The following are Windows only.
+// The following are Windows only:
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_RGBCOLORTABLEROW_X86
-// Visual C 2012 required for AVX2.
+// Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 #define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
@@ -157,7 +161,7 @@ extern "C" {
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2

-// Effects
+// Effects:
 #define HAS_ARGBADDROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
@@ -167,7 +171,7 @@ extern "C" {
 #endif
 #endif

-// The following are Yasm x86 only.
+// The following are Yasm x86 only:
 // TODO(fbarchard): Port AVX2 to inline.
 #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
     (defined(_M_IX86) || defined(_M_X64) || \
@@ -194,7 +198,7 @@ extern "C" {
 #endif
 #endif

-// The following are available on Neon platforms
+// The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOUVROW_NEON
@@ -267,7 +271,7 @@ extern "C" {
 #define HAS_YUY2TOUVROW_NEON
 #define HAS_YUY2TOYROW_NEON

-// Effects
+// Effects:
 #define HAS_ARGBADDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
@@ -286,7 +290,7 @@ extern "C" {
 #define HAS_INTERPOLATEROW_NEON
 #endif

-// The following are available on Mips platforms
+// The following are available on Mips platforms:
 #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
 #define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -1534,8 +1538,9 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_C(int num, int div);

-#ifdef HAS_FIXEDDIV
-int FixedDiv(int num, int div);
+int FixedDiv_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
 #else
 #define FixedDiv FixedDiv_C
 #endif
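Note on the FixedDiv change above: row.h now declares both implementations and
maps the unsuffixed name onto whichever one the platform provides, so callers
keep writing FixedDiv(). For orientation, a minimal sketch of the portable
16.16 fixed-point fallback, assumed equivalent to the FixedDiv_C defined in
row_common.cc rather than copied from it:

    // Divide num by div and return as 16.16 fixed point result.
    // Widen to 64 bits and pre-shift the numerator by 16 before dividing.
    int FixedDiv_C(int num, int div) {
      return static_cast<int>((static_cast<int64>(num) << 16) / div);
    }

The x86 versions further down (row_posix.cc and row_win.cc) compute the same
quotient by building num << 16 in edx:eax and issuing a single idiv.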
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 5e853e981..3a78b51c6 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 745
+#define LIBYUV_VERSION 746

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/compare.cc b/source/compare.cc
index f8b358309..93935b1f3 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -30,7 +30,9 @@ extern "C" {
 uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);

 // This module is for Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(_M_IX86) || \
     (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
 #define HAS_HASHDJB2_SSE41

@@ -73,8 +75,9 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
 #define HAS_SUMSQUAREERROR_NEON
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
 #endif
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
-    defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
 #endif
diff --git a/source/compare_posix.cc b/source/compare_posix.cc
index 61b012364..b97a6eaa5 100644
--- a/source/compare_posix.cc
+++ b/source/compare_posix.cc
@@ -16,7 +16,9 @@ namespace libyuv {
 extern "C" {
 #endif

-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
+    (defined(__x86_64__) || defined(__i386__))

 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
@@ -65,6 +67,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
 #endif  // defined(__x86_64__) || defined(__i386__)

 #if !defined(LIBYUV_DISABLE_X86) && \
+    !(defined(__native_client__) && defined(__x86_64__)) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
 #define HAS_HASHDJB2_SSE41
 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
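The new !(defined(__native_client__) && defined(__x86_64__)) clauses keep these
inline-assembly paths out of Native Client x86-64 builds, which instead fall
back to the portable code. For reference, the scalar path selected in that case
is a plain sum of squared byte differences; a minimal sketch, assumed
equivalent to the SumSquareError_C referenced above rather than copied from it:

    uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
      uint32 sse = 0u;
      for (int i = 0; i < count; ++i) {
        int diff = src_a[i] - src_b[i];           // per-byte difference
        sse += static_cast<uint32>(diff * diff);  // accumulate squared error
      }
      return sse;
    }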
diff --git a/source/row_common.cc b/source/row_common.cc
index 60af38608..67ffc96b6 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1904,7 +1904,7 @@ void I422ToUYVYRow_C(const uint8* src_y,
   }
 }

-#if !defined(LIBYUV_DISABLE_X86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
 // row_win.cc has asm version, but GCC uses 2 step wrapper.  5% slower.
 // TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
 #if defined(__x86_64__) || defined(__i386__)
@@ -2001,7 +2001,6 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
   UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }
-
 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
 #endif  // !defined(LIBYUV_DISABLE_X86)

 #undef clamp0
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 239731a17..642a0a71e 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3027,6 +3027,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_COPYROW_X86

+#ifdef HAS_COPYROW_ERMS
 // Unaligned Multiple of 1.
 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
   size_t width_tmp = static_cast<size_t>(width);
@@ -3039,6 +3040,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
   : "memory", "cc"
   );
 }
+#endif  // HAS_COPYROW_ERMS

 #ifdef HAS_SETROW_X86
 void SetRow_X86(uint8* dst, uint32 v32, int width) {
@@ -4167,14 +4169,14 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   asm volatile (
     "movd      %3,%%xmm2                       \n"
-    "sub       %0,%1                           \n"
     "punpcklbw %%xmm2,%%xmm2                   \n"
     "punpcklqdq %%xmm2,%%xmm2                  \n"

     // 4 pixel loop.
     ".p2align  2                               \n"
   "1:                                          \n"
-    "movdqa    (%0),%%xmm0                     \n"
+    "movdqa    "MEMACCESS(0)",%%xmm0           \n"
+    "lea       "MEMLEA(0x10,0)",%0             \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm0,%%xmm0                   \n"
     "punpckhbw %%xmm1,%%xmm1                   \n"
@@ -4184,8 +4186,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%0,%1,1)                \n"
-    "lea       0x10(%0),%0                     \n"
+    "movdqa    %%xmm0,"MEMACCESS(1)"           \n"
+    "lea       "MEMLEA(0x10,1)",%1             \n"
     "jg        1b                              \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_argb),  // %1
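The pattern behind the ARGBShadeRow_SSE2 rewrite above, repeated in the hunks
that follow: the old code subtracted the source pointer from the destination
once ("sub %0,%1") and stored through (%0,%1,1), so a single lea advanced every
pointer at once. Native Client's x86-64 sandbox does not allow that addressing
trick, so each pointer now advances independently and every memory operand goes
through the MEMACCESS/MEMLEA wrappers. Roughly what those macros expand to,
recalled from the NaCl port convention; the definitions near the top of row.h
are authoritative:

    #if defined(__native_client__) && defined(__x86_64__)
    // NaCl x86-64: every access is relative to the r15 sandbox base register,
    // with the 32-bit pointer supplied as the index.
    #define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
    #define MEMLEA(offset, base) #offset "(%q" #base ")"
    #else
    // Elsewhere the wrappers reduce to plain (%reg) and disp(%reg) operands.
    #define MEMACCESS(base) "(%" #base ")"
    #define MEMLEA(offset, base) #offset "(%" #base ")"
    #endif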
".p2align 4 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqu "MEMACCESS(1)",%%xmm1 \n" + "lea "MEMLEA(0x10,1)",%1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0,"MEMACCESS(2)" \n" + "lea "MEMLEA(0x10,2)",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4275,18 +4276,17 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - // 4 pixel loop. ".p2align 4 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu "MEMACCESS(0)",%%xmm0 \n" + "lea "MEMLEA(0x10,0)",%0 \n" + "movdqu "MEMACCESS(1)",%%xmm1 \n" + "lea "MEMLEA(0x10,1)",%1 \n" "psubusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0,"MEMACCESS(2)" \n" + "lea "MEMLEA(0x10,2)",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4793,6 +4793,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -4895,6 +4896,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -5009,6 +5011,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -5111,6 +5114,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -5225,6 +5229,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_HALFROW_SSE2 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { asm volatile ( @@ -5247,7 +5252,9 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, #endif ); } +#endif // HAS_HALFROW_SSE2 +#ifdef HAS_ARGBTOBAYERROW_SSSE3 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( @@ -5275,7 +5282,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, #endif ); } +#endif // HAS_ARGBTOBAYERROW_SSSE3 +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
@@ -4793,6 +4793,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2

+#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride, int dst_width,
@@ -4895,6 +4896,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #endif
   );
 }
+#endif  // HAS_INTERPOLATEROW_SSSE3

 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
@@ -5009,6 +5011,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2

+#ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
@@ -5111,6 +5114,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
 #endif
   );
 }
+#endif  // HAS_INTERPOLATEROW_SSSE3

 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
@@ -5225,6 +5229,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2

+#ifdef HAS_HALFROW_SSE2
 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   asm volatile (
@@ -5247,7 +5252,9 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
 #endif
   );
 }
+#endif  // HAS_HALFROW_SSE2

+#ifdef HAS_ARGBTOBAYERROW_SSSE3
 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
   asm volatile (
@@ -5275,7 +5282,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
 #endif
   );
 }
+#endif  // HAS_ARGBTOBAYERROW_SSSE3

+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                           const uint8* shuffler, int pix) {
@@ -5330,7 +5339,9 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
 #endif
   );
 }
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3

+#ifdef HAS_I422TOYUY2ROW_SSE2
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -5365,7 +5376,9 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
 #endif
   );
 }
+#endif  // HAS_I422TOYUY2ROW_SSE2

+#ifdef HAS_I422TOUYVYROW_SSE2
 void I422ToUYVYRow_SSE2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -5400,9 +5413,11 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 #endif
   );
 }
+#endif  // HAS_I422TOUYVYROW_SSE2

+#ifdef HAS_FIXEDDIV_X86
 // Divide num by div and return as 16.16 fixed point result.
-int FixedDiv(int num, int div) {
+int FixedDiv_X86(int num, int div) {
   asm volatile (
     "cdq                                       \n"
     "shld      $0x10,%%eax,%%edx               \n"
@@ -5415,6 +5430,7 @@ int FixedDiv(int num, int div) {
   );
   return num;
 }
+#endif  // HAS_FIXEDDIV_X86

 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus
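Both FixedDiv_X86 bodies do the same thing: cdq sign-extends num into edx,
shld/shl shift the 64-bit pair edx:eax left by 16, and idiv then produces
(num << 16) / div in one instruction. A small worked check of the 16.16
results, in plain C for illustration:

    // 16.16 fixed point: integer part in the high 16 bits, fraction below.
    int third = FixedDiv(1, 3);      // 65536 / 3 = 21845, ~0.3333
    int ratio = FixedDiv(640, 480);  // 41943040 / 480 = 87381, ~1.3333
    double check = ratio / 65536.0;  // back to floating point: 1.3333...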
diff --git a/source/row_win.cc b/source/row_win.cc
index a255293b0..25c07b3f8 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -5239,13 +5239,13 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     mov        edx, [esp + 8]   // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
-    sub        edx, eax
     punpcklbw  xmm2, xmm2
     punpcklqdq xmm2, xmm2

     align      16
  convertloop:
     movdqa     xmm0, [eax]      // read 4 pixels
+    lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
     punpcklbw  xmm0, xmm0       // first 2
     punpckhbw  xmm1, xmm1       // next 2
@@ -5255,8 +5255,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop

     ret
@@ -5276,25 +5276,25 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    movdqu     xmm2, [eax + esi]  // read 4 pixels from src_argb1
+    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
     movdqu     xmm1, xmm0
     movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0   // first 2
-    punpckhbw  xmm1, xmm1   // next 2
-    punpcklbw  xmm2, xmm5   // first 2
-    punpckhbw  xmm3, xmm5   // next 2
-    pmulhuw    xmm0, xmm2   // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3   // src_argb0 * src_argb1 next 2
+    punpcklbw  xmm0, xmm0         // first 2
+    punpckhbw  xmm1, xmm1         // next 2
+    punpcklbw  xmm2, xmm5         // first 2
+    punpckhbw  xmm3, xmm5         // next 2
+    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    lea        eax, [eax + 16]
+    lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
     sub        ecx, 4
-    movdqu     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop

     pop        esi
@@ -5315,8 +5315,6 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     sub        ecx, 4
     jl         convertloop49
@@ -5324,11 +5322,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     align      16
  convertloop4:
     movdqu     xmm0, [eax]      // read 4 pixels from src_argb0
-    movdqu     xmm1, [eax + esi]  // read 4 pixels from src_argb1
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]      // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
     paddusb    xmm0, xmm1       // src_argb0 + src_argb1
     sub        ecx, 4
-    movdqu     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     jge        convertloop4

  convertloop49:
@@ -5337,11 +5337,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,

  convertloop1:
     movd       xmm0, [eax]      // read 1 pixels from src_argb0
-    movd       xmm1, [eax + esi]  // read 1 pixels from src_argb1
+    lea        eax, [eax + 4]
+    movd       xmm1, [esi]      // read 1 pixels from src_argb1
+    lea        esi, [esi + 4]
     paddusb    xmm0, xmm1       // src_argb0 + src_argb1
     sub        ecx, 1
-    movd       [eax + edx], xmm0
-    lea        eax, [eax + 4]
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
     jge        convertloop1

  convertloop19:
@@ -5362,17 +5364,17 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     movdqu     xmm0, [eax]      // read 4 pixels from src_argb0
-    movdqu     xmm1, [eax + esi]  // read 4 pixels from src_argb1
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]      // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
     psubusb    xmm0, xmm1       // src_argb0 - src_argb1
     sub        ecx, 4
-    movdqu     [eax + edx], xmm0
-    lea        eax, [eax + 16]
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
     jg         convertloop

     pop        esi
@@ -5392,14 +5394,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
-    sub        esi, eax
-    sub        edx, eax
+    vpxor      ymm5, ymm5, ymm5  // constant 0

     align      16
  convertloop:
     vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
-    vmovdqu    ymm3, [eax + esi]  // read 8 pixels from src_argb1
+    lea        eax, [eax + 32]
+    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    lea        esi, [esi + 32]
     vpunpcklbw ymm0, ymm1, ymm1   // low 4
     vpunpckhbw ymm1, ymm1, ymm1   // high 4
     vpunpcklbw ymm2, ymm3, ymm5   // low 4
@@ -5407,8 +5409,8 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
     vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
     vpackuswb  ymm0, ymm0, ymm1
-    vmovdqu    [eax + edx], ymm0
-    lea        eax, [eax + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop

@@ -5430,15 +5432,15 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
-    vpaddusb   ymm0, ymm0, [eax + esi]  // add 8 pixels from src_argb1
-    vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
+    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop

@@ -5460,15 +5462,15 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     mov        esi, [esp + 4 + 8]   // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    sub        esi, eax
-    sub        edx, eax

     align      16
  convertloop:
     vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
-    vpsubusb   ymm0, ymm0, [eax + esi]  // src_argb0 - src_argb1
-    vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
+    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
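Callers never name these suffixed kernels directly; a row-function pointer is
chosen at runtime from the HAS_ macros plus a CPU-feature check. A condensed
sketch of the usual libyuv dispatch shape; the real selection (in
planar_functions.cc) also checks pointer and stride alignment:

    void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1,
                            uint8* dst, int width) = ARGBMultiplyRow_C;
    #if defined(HAS_ARGBMULTIPLYROW_SSE2)
      if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
        ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
      }
    #endif
    #if defined(HAS_ARGBMULTIPLYROW_AVX2)
      if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
        ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
      }
    #endif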
@@ -6646,9 +6648,10 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
   }
 }

+#ifdef HAS_FIXEDDIV_X86
 // Divide num by div and return as 16.16 fixed point result.
 __declspec(naked) __declspec(align(16))
-int FixedDiv(int num, int div) {
+int FixedDiv_X86(int num, int div) {
   __asm {
     mov        eax, [esp + 4]    // num
     cdq                          // extend num to 64 bits
@@ -6658,6 +6661,7 @@ int FixedDiv(int num, int div) {
     ret
   }
 }
+#endif  // HAS_FIXEDDIV_X86

 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

 #ifdef __cplusplus
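The main consumer of FixedDiv is the scaler, which steps across the source in
16.16 increments. A hedged illustration of that usage pattern; the variable
names here are assumed for the example, not taken from this patch:

    // Walk the source row at a fixed-point ratio of src/dst per output pixel.
    int dx = FixedDiv(src_width, dst_width);  // e.g. 640/480 -> 87381 (~1.333)
    int x = 0;
    for (int j = 0; j < dst_width; ++j) {
      dst[j] = src[x >> 16];                  // nearest-neighbour sample
      x += dx;
    }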