From c7161d1c36fb43f8ed0ddf25842f9894be1940d4 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Sun, 12 Apr 2015 23:54:26 +0000 Subject: [PATCH] Remove code alignment declspec from Visual C versions for vs2014 compatibility. BUG=422 TESTED=local vs2013 build still passes. Review URL: https://webrtc-codereview.appspot.com/45959004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1365 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/compare_win.cc | 8 +- source/mjpeg_validate.cc | 2 +- source/rotate.cc | 4 +- source/row_win.cc | 256 +++++++++++++++++++-------------------- source/scale_win.cc | 54 ++++----- 7 files changed, 162 insertions(+), 166 deletions(-) diff --git a/README.chromium b/README.chromium index 52c0d9f43..b2f917e5f 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1364 +Version: 1365 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index c87ce0a91..30b658db4 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1364 +#define LIBYUV_VERSION 1365 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/compare_win.cc b/source/compare_win.cc index 0395e6565..603849374 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -19,7 +19,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ defined(_MSC_VER) && !defined(__clang__) -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { __asm { mov eax, [esp + 4] // src_a @@ -60,7 +60,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { #if _MSC_VER >= 1700 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. #pragma warning(disable: 4752) -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { __asm { mov eax, [esp + 4] // src_a @@ -134,7 +134,7 @@ static uvec32 kHashMul3 = { #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ _asm _emit 0x40 _asm _emit reg -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { __asm { mov eax, [esp + 4] // src @@ -185,7 +185,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { __asm { mov eax, [esp + 4] // src diff --git a/source/mjpeg_validate.cc b/source/mjpeg_validate.cc index 40ce2f787..8edfbe1e7 100644 --- a/source/mjpeg_validate.cc +++ b/source/mjpeg_validate.cc @@ -23,7 +23,7 @@ extern "C" { #ifdef ENABLE_SCASB // Multiple of 1. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) { __asm { mov edx, edi diff --git a/source/rotate.cc b/source/rotate.cc index bfab546a3..4d8c40cca 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -73,7 +73,7 @@ void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, #if !defined(LIBYUV_DISABLE_X86) && \ defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) #define HAS_TRANSPOSE_WX8_SSSE3 -__declspec(naked) __declspec(align(16)) +__declspec(naked) static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { __asm { @@ -165,7 +165,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, } #define HAS_TRANSPOSE_UVWX8_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, diff --git a/source/row_win.cc b/source/row_win.cc index 3bfa97431..1c9f13cc1 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -147,8 +147,6 @@ static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { // 64 bit #if defined(_M_X64) - -__declspec(align(16)) void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -198,10 +196,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, width -= 8; } } - // 32 bit #else // defined(_M_X64) - #ifdef HAS_ARGBTOYROW_SSSE3 // Constants for ARGB. @@ -324,7 +320,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = { }; // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_y @@ -353,7 +349,7 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_y @@ -383,7 +379,7 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { } #endif // HAS_J400TOARGBROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_rgb24 @@ -421,7 +417,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { __asm { @@ -467,7 +463,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix) { __asm { @@ -523,7 +519,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix) { __asm { @@ -574,7 +570,7 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, #endif // HAS_RGB565TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, int pix) { __asm { @@ -624,7 +620,7 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, #endif // HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, int pix) { __asm { @@ -660,7 +656,7 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, #endif // HAS_ARGB4444TOARGBROW_AVX2 // 24 instructions -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix) { __asm { @@ -713,7 +709,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, } // 18 instructions. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix) { __asm { @@ -751,7 +747,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -789,7 +785,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -828,7 +824,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } // 4 pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -866,7 +862,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } // 8 pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int pix) { __asm { @@ -912,7 +908,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int pix) { __asm { @@ -955,7 +951,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 // TODO(fbarchard): Improve sign extension/packing. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -996,7 +992,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -1026,7 +1022,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } #ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -1063,7 +1059,7 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #endif // HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -1103,7 +1099,7 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #endif // HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -1134,7 +1130,7 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #endif // HAS_ARGBTOARGB4444ROW_AVX2 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1169,7 +1165,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1288,7 +1284,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1321,7 +1317,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1354,7 +1350,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1387,7 +1383,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1457,7 +1453,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1594,7 +1590,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1651,7 +1647,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUV422Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1709,7 +1705,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1779,7 +1775,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1849,7 +1845,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -2005,7 +2001,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2041,7 +2037,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_J422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void J422ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2077,7 +2073,7 @@ void J422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I444ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2113,7 +2109,7 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I411TOARGBROW_AVX2 // 16 pixels // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I411ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2149,7 +2145,7 @@ void I411ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV12TOARGBROW_AVX2 // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV12ToARGBRow_AVX2(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -2180,7 +2176,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV21TOARGBROW_AVX2 // 16 pixels. // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV21ToARGBRow_AVX2(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -2212,7 +2208,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToBGRARow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2258,7 +2254,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGBARow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2304,7 +2300,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToABGRRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2542,7 +2538,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2575,7 +2571,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGB24Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2609,7 +2605,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRAWRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2643,7 +2639,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2682,7 +2678,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2716,7 +2712,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // JPeg color space version of I422ToARGB // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void J422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2750,7 +2746,7 @@ void J422ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // Similar to I420 but duplicate UV once more. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2785,7 +2781,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV12ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -2813,7 +2809,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV21ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -2839,7 +2835,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2869,7 +2865,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2900,7 +2896,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGBARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2934,7 +2930,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width) { @@ -2982,7 +2978,7 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* rgb_buf, int width) { @@ -3037,7 +3033,7 @@ static const uvec8 kShuffleMirror = { }; // TODO(fbarchard): Replace lea with -16 offset. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3058,7 +3054,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3081,7 +3077,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3112,7 +3108,7 @@ static const uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -3142,7 +3138,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif // HAS_MIRRORROW_UV_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3169,7 +3165,7 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = { 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3190,7 +3186,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi @@ -3228,7 +3224,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi @@ -3266,7 +3262,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { __asm { @@ -3297,7 +3293,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { __asm { @@ -3331,7 +3327,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #ifdef HAS_COPYROW_SSE2 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { __asm { mov eax, [esp + 4] // src @@ -3354,7 +3350,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #ifdef HAS_COPYROW_AVX // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) { __asm { mov eax, [esp + 4] // src @@ -3378,7 +3374,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_AVX // Multiple of 1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { __asm { mov eax, esi @@ -3395,7 +3391,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3431,7 +3427,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3460,7 +3456,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3498,7 +3494,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3530,7 +3526,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_SETROW_X86 // Write 'count' bytes using an 8 bit value repeated. // Count should be multiple of 4. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) { __asm { movzx eax, byte ptr [esp + 8] // v8 @@ -3547,7 +3543,7 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) { } // Write 'count' bytes using an 8 bit value repeated. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) { __asm { mov edx, edi @@ -3561,7 +3557,7 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) { } // Write 'count' 32 bit values. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { __asm { mov edx, edi @@ -3576,7 +3572,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -3603,7 +3599,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3647,7 +3643,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3686,7 +3682,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -3711,7 +3707,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3755,7 +3751,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3796,7 +3792,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -3821,7 +3817,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3864,7 +3860,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3900,7 +3896,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -3923,7 +3919,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3966,7 +3962,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -4005,7 +4001,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, #ifdef HAS_ARGBBLENDROW_SSE2 // Blend 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4139,7 +4135,7 @@ static const uvec8 kShuffleAlpha = { // pshufb xmm3, kShuffleAlpha // alpha // Blend 8 pixels at a time. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4255,7 +4251,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -4304,7 +4300,7 @@ static const uvec8 kShuffleAlpha1 = { 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -4348,7 +4344,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { static const uvec8 kShuffleAlpha_AVX2 = { 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -4385,7 +4381,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { @@ -4439,7 +4435,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = { // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. // USE_GATHER is not on by default, due to being a slow instruction. #ifdef USE_GATHER -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { @@ -4473,7 +4469,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, } } #else // USE_GATHER -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { @@ -4540,7 +4536,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -4599,7 +4595,7 @@ static const vec8 kARGBToSepiaR = { }; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] /* dst_argb */ @@ -4656,7 +4652,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // Same as Sepia except matrix is provided. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { __asm { @@ -4717,7 +4713,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { __asm { @@ -4762,7 +4758,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { __asm { @@ -4796,7 +4792,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4835,7 +4831,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4883,7 +4879,7 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4912,7 +4908,7 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4949,7 +4945,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4978,7 +4974,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -5010,7 +5006,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { __asm { @@ -5066,7 +5062,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { __asm { @@ -5119,7 +5115,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { __asm { @@ -5166,7 +5162,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_y, int width) { __asm { @@ -5199,7 +5195,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { __asm { @@ -5486,7 +5482,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) __declspec(align(16)) +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width) { @@ -5571,7 +5567,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5668,7 +5664,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, #endif // HAS_INTERPOLATEROW_AVX2 // Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5769,7 +5765,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5876,7 +5872,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, #endif // HAS_INTERPOLATEROW_SSE2 // Specialized ARGB to Bayer that just isolates G channel. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { __asm { @@ -5906,7 +5902,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { @@ -5932,7 +5928,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { @@ -5960,7 +5956,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { @@ -6082,7 +6078,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, // UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -6119,7 +6115,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -6157,7 +6153,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width) { @@ -6216,7 +6212,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width) { @@ -6256,7 +6252,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { __asm { @@ -6290,7 +6286,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { __asm { push esi @@ -6321,7 +6317,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, uint32 lumacoeff) { diff --git a/source/scale_win.cc b/source/scale_win.cc index a9a68b668..63c66d69f 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -95,7 +95,7 @@ static uvec16 kScaleAb2 = { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; // Reads 32 pixels, throws half away and writes 16 pixels. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -121,7 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x1 rectangle to 16x1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -157,7 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 16x1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -200,7 +200,7 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -228,7 +228,7 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x1 rectangle to 32x1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -265,7 +265,7 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x2 rectangle to 32x1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -307,7 +307,7 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -338,7 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x4 rectangle to 8x1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -401,7 +401,7 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Then shuffled to do the scaling. // Note that movdqa+palign may be better than movdqu. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -448,7 +448,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -505,7 +505,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -567,7 +567,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -598,7 +598,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -663,7 +663,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -709,7 +709,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, // Reads 16xN bytes and produces 16 shorts at a time. // TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { @@ -775,7 +775,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // when drmemory bug fixed. // https://code.google.com/p/drmemory/issues/detail?id=1396 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { __asm { @@ -852,7 +852,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { __asm { @@ -877,7 +877,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { @@ -902,7 +902,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { @@ -930,7 +930,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { @@ -964,7 +964,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, } // Reads 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { @@ -1000,7 +1000,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, } // Blends four 2x2 to 4x1. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -1048,7 +1048,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, } // Column scaling unfiltered. SSE2 version. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { @@ -1139,7 +1139,7 @@ static uvec8 kShuffleFractions = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { @@ -1210,7 +1210,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, } // Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { @@ -1235,7 +1235,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) __declspec(align(16)) +__declspec(naked) int FixedDiv_X86(int num, int div) { __asm { mov eax, [esp + 4] // num @@ -1248,7 +1248,7 @@ int FixedDiv_X86(int num, int div) { } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) __declspec(align(16)) +__declspec(naked) int FixedDiv1_X86(int num, int div) { __asm { mov eax, [esp + 4] // num
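
The change applied throughout the files above is mechanical: wherever `__declspec(align(16))` is paired with `__declspec(naked)`, the alignment declspec is dropped and only `naked` is kept, because newer Visual C toolchains no longer accept the alignment declspec on these function definitions (local vs2013 builds still pass without it, per the commit message). A minimal sketch of the resulting form, assuming a standalone x86 MSVC translation unit — the typedefs and the stub asm body are illustrative, not taken verbatim from the patch:

```cpp
// Illustrative sketch only; libyuv defines these types in its own headers.
typedef unsigned char uint8;
typedef unsigned int uint32;

// Old form, accepted by vs2013 but rejected by newer Visual C toolchains:
//   __declspec(naked) __declspec(align(16))
//   uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);

// New form: only the naked declspec remains; function entry alignment is left
// to the compiler/linker defaults.
__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov  eax, [esp + 4]   // src_a
    mov  edx, [esp + 8]   // src_b
    mov  ecx, [esp + 12]  // count
    // ... SSE2 sum-of-squared-differences loop elided; see compare_win.cc ...
    xor  eax, eax         // placeholder return value so the sketch assembles
    ret                   // naked functions must supply their own ret
  }
}
```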