From d2f4413d29d15b94d971630ba555dd0cd8fcc8c2 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 4 Apr 2012 21:53:27 +0000 Subject: [PATCH] Remove old alpha blend, expose GetARGBBlend, fix ComputeSumSquareErrorPlane on SSE2 BUG=29 TEST=none Review URL: https://webrtc-codereview.appspot.com/469005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@234 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 21 +- include/libyuv/rotate.h | 2 +- include/libyuv/version.h | 2 +- source/compare.cc | 53 ++-- source/convert.cc | 19 +- source/convert_from.cc | 8 +- source/format_conversion.cc | 7 +- source/planar_functions.cc | 124 +++------- source/rotate.cc | 16 +- source/rotate_neon.cc | 2 +- source/row.h | 37 ++- source/row_common.cc | 65 +---- source/row_neon.cc | 2 +- source/row_posix.cc | 179 +------------- source/row_win.cc | 390 +++++------------------------- source/scale.cc | 51 ++-- 17 files changed, 221 insertions(+), 759 deletions(-) diff --git a/README.chromium b/README.chromium index b83e57530..3af24f5a3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 233 +Version: 234 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 87de9b6b7..d7fd3e10e 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -133,24 +133,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Alpha Blend ARGB row of pixels. -void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width); +typedef void (*ARGBBlendRow)(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, int width); -// Alpha Blend 2 rows of ARGB pixels and store to destination. -void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); +// Get function to Alpha Blend ARGB pixels and store to destination. +ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width); -// Alpha Blend ARGB. -int ARGBBlend(const uint8* src_argb, int src_stride_argb, +// Alpha Blend ARGB images and store to destination. +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, + const uint8* src_argb1, int src_stride_argb1, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Alpha Blend 2 ARGB images and store to destination. -int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); - // Convert I422 to YUY2. 
int I422ToYUY2(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h index f8d2f57db..773290701 100644 --- a/include/libyuv/rotate.h +++ b/include/libyuv/rotate.h @@ -20,7 +20,7 @@ extern "C" { // Supported rotation enum RotationMode { - kRotate0 = 0, // No rotation + kRotate0 = 0, // No rotation kRotate90 = 90, // Rotate 90 degrees clockwise kRotate180 = 180, // Rotate 180 degrees kRotate270 = 270, // Rotate 270 degrees clockwise diff --git a/include/libyuv/version.h b/include/libyuv/version.h index aced5e45c..cdae68054 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 233 +#define LIBYUV_VERSION 234 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare.cc b/source/compare.cc index b1b88769f..5fccd3930 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -25,18 +25,37 @@ namespace libyuv { extern "C" { #endif -// hash seed of 5381 recommended. -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { +// Internal C version of HashDjb2 with int sized count for efficiency. +static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { uint32 hash = seed; - if (count > 0) { - do { - hash = hash * 33 + *src++; - } while (--count); + for (int i = 0; i < count; ++i) { + hash += (hash << 5) + src[i]; } return hash; } -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +// hash seed of 5381 recommended. +uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { + const int kBlockSize = 1 << 15; // 32768; + while (count >= static_cast<uint64>(kBlockSize)) { + seed = HashDjb2_C(src, kBlockSize, seed); + src += kBlockSize; + count -= kBlockSize; + } + int remainder = static_cast<int>(count) & ~15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + src += remainder; + count -= remainder; + } + remainder = static_cast<int>(count) & 15; + if (remainder) { + seed = HashDjb2_C(src, remainder, seed); + } + return seed; +} + +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SUMSQUAREERROR_NEON static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, @@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, return sse; } -#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SUMSQUAREERROR_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { __asm { @@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, movdqa xmm2, [eax + edx] lea eax, [eax + 16] sub ecx, 16 - movdqa xmm3, xmm1 + movdqa xmm3, xmm1 // abs trick psubusb xmm1, xmm2 psubusb xmm2, xmm3 por xmm1, xmm2 @@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SUMSQUAREERROR_SSE2 static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { @@ -167,11 +186,9 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { uint32 sse = 0u; - for (int x = 0; x < count; ++x) { - int diff = src_a[0] - src_b[0]; + 
for (int i = 0; i < count; ++i) { + int diff = src_a[i] - src_b[i]; sse += static_cast<uint32>(diff * diff); - src_a += 1; - src_b += 1; } return sse; } @@ -187,6 +204,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, #elif defined(HAS_SUMSQUAREERROR_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + // Note only used for multiples of 16 so count is not checked. SumSquareError = SumSquareError_SSE2; } #endif @@ -225,8 +243,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, SumSquareError = SumSquareError_NEON; } #elif defined(HAS_SUMSQUAREERROR_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) && + IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) { SumSquareError = SumSquareError_SSE2; } #endif diff --git a/source/convert.cc b/source/convert.cc index e07970d80..0b1f03c74 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y, return 0; } -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_HALFROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { __asm { @@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_HALFROW_SSE2 static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { @@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y, // Blends 32x2 pixels to 16x1 // source in scale.cc -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SCALEROWDOWN2_NEON void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst, int dst_width); -#elif defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) && \ - !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) + void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); #endif @@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420, width, height); } -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SPLITYUY2_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SPLITYUY2_SSE2 static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { diff --git a/source/convert_from.cc b/source/convert_from.cc index 9a0d32ad9..efe58dd82 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y, // UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) 
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_I42XTOYUY2ROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void I42xToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y, } #define HAS_I42XTOUYVYROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void I42xToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y, ret } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_I42XTOYUY2ROW_SSE2 static void I42xToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 46d7e7e23..1cdf709e4 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -24,9 +24,9 @@ extern "C" { // and vst would select which 2 components to write. The low level would need // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBTOBAYERROW_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { __asm { @@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, mov ecx, [esp + 16] // pix pshufd xmm5, xmm5, 0 + align 16 wloop: movdqa xmm0, [eax] lea eax, [eax + 16] @@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_ARGBTOBAYERROW_SSSE3 static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 866fcb4fd..2bc3e3fe2 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -137,87 +137,38 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, return 0; } -// Alpha Blend ARGB -void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) { -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow_SSSE3(src_argb, dst_argb, width); - return; - } -#endif +// Get a blender that optimized for the CPU, alignment and pixel count. +// As there are 6 blenders to choose from, the caller should try to use +// the same blend function for all pixels if possible. +ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) { + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = ARGBBlendRow_C; #if defined(HAS_ARGBBLENDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlendRow_SSE2(src_argb, dst_argb, width); - return; - } -#endif - ARGBBlendRow_C(src_argb, dst_argb, width); -} - -// Alpha Blend 2 rows of ARGB pixels and store to destination. 
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width); - return; - } -#endif -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width); - return; - } -#endif - ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width); -} - -// Alpha Blend ARGB -// TODO(fbarchard): Call 3 pointer low levels to reduce code size. -int ARGBBlend(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (!src_argb || !dst_argb || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - - void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBBlendRow_C; -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlendRow = ARGBBlendRow_SSE2; - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBBlendRow = ARGBBlendRow_Aligned_SSE2; + ARGBBlendRow = ARGBBlendRow1_SSE2; + if (width >= 4) { + ARGBBlendRow = ARGBBlendRow_Any_SSE2; + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBBlendRow = ARGBBlendRow_Aligned_SSE2; + } } } #endif #if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlendRow = ARGBBlendRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + ARGBBlendRow = ARGBBlendRow_Any_SSSE3; if (IS_ALIGNED(width, 4) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3; } } #endif - - for (int y = 0; y < height; ++y) { - ARGBBlendRow(src_argb, dst_argb, width); - src_argb += src_stride_argb; - dst_argb += dst_stride_argb; - } - return 0; + return ARGBBlendRow; } // Alpha Blend 2 ARGB images and store to destination. 
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, +int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, uint8* dst_argb, int dst_stride_argb, int width, int height) { @@ -230,30 +181,12 @@ int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - - void (*ARGBBlend2Row)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = ARGBBlend2Row_C; -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlend2Row = ARGBBlend2Row_SSE2; - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2; - } - } -#endif -#if defined(HAS_ARGBBLENDROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBBlend2Row = ARGBBlend2Row_SSSE3; - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3; - } - } -#endif + void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, + uint8* dst_argb, int width) = + GetARGBBlend(dst_argb, dst_stride_argb, width); for (int y = 0; y < height; ++y) { - ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; @@ -725,7 +658,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // SetRow8 writes 'count' bytes using a 32 bit value repeated // SetRow32 writes 'count' words using a 32 bit value repeated -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SETROW_NEON static void SetRow8_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( @@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width, } } -#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SETROW_X86 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void SetRow8_X86(uint8* dst, uint32 v32, int count) { __asm { mov edx, edi @@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void SetRows32_X86(uint8* dst, uint32 v32, int width, int dst_stride, int height) { __asm { @@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SETROW_X86 static void SetRow8_X86(uint8* dst, uint32 v32, int width) { size_t width_tmp = static_cast<size_t>(width); @@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y, return 0; } +// TODO(fbarchard): Add TestCpuFlag(kCpuHasX86) to allow C code to be tested. 
// Draw a rectangle into ARGB int ARGBRect(uint8* dst_argb, int dst_stride_argb, int dst_x, int dst_y, @@ -916,12 +850,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; #if defined(HAS_SETROW_X86) SetRows32_X86(dst, value, width, dst_stride_argb, height); -#elif defined(HAS_SETROW_NEON) +#else +#if defined(HAS_SETROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) { SetRows32_NEON(dst, value, width, dst_stride_argb, height); return 0; } +#endif SetRows32_C(dst, value, width, dst_stride_argb, height); #endif return 0; diff --git a/source/rotate.cc b/source/rotate.cc index f5f9075c3..a029a17bc 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -21,8 +21,8 @@ namespace libyuv { extern "C" { #endif -#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ - !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) #if defined(__APPLE__) && defined(__i386__) #define DECLARE_FUNCTION(name) \ ".text \n" \ @@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, int width); #endif -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_TRANSPOSE_WX8_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { __asm { @@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, } #define HAS_TRANSPOSE_UVWX8_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, @@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ret } } -#elif defined(__i386__) || defined(__x86_64__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { @@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ); } -#if defined (__i386__) +#if !defined(YUV_DISABLE_ASM) && defined (__i386__) #define HAS_TRANSPOSE_UVWX8_SSE2 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, @@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "pop %ebx \n" "ret \n" ); -#elif defined(__x86_64__) +#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. 
#define HAS_TRANSPOSE_WX8_FAST_SSSE3 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 0240fe12f..7ff993617 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -17,7 +17,7 @@ namespace libyuv { extern "C" { #endif -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) static const uvec8 vtbl_4x4_transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; diff --git a/source/row.h b/source/row.h index c70160025..4ed17a096 100644 --- a/source/row.h +++ b/source/row.h @@ -18,6 +18,7 @@ namespace libyuv { extern "C" { #endif +// TODO(fbarchard): Remove kMaxStride #define kMaxStride (2560 * 4) #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) @@ -26,8 +27,9 @@ extern "C" { #endif // The following are available on all x86 platforms -#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ - !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) + #define HAS_ABGRTOARGBROW_SSSE3 #define HAS_BGRATOARGBROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 @@ -66,7 +68,7 @@ extern "C" { #endif // The following are available on Neon platforms -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_MIRRORROW_NEON #define HAS_MIRRORROWUV_NEON #define HAS_SPLITUV_NEON @@ -78,7 +80,7 @@ extern "C" { // The following are only available on Win32 // TODO(fbarchard): Port to GCC -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBBLENDROW_SSSE3 #endif @@ -265,25 +267,18 @@ void YToARGBRow_SSE2(const uint8* y_buf, int width); // ARGB preattenuated alpha blend. -void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width); -void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, - int width); -void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width); - -// ARGB preattenuated alpha blend with 2 sources and a destination. -void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); -void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); -void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width); // 'Any' functions handle any size and alignment. 
void I420ToARGBRow_Any_SSSE3(const uint8* y_buf, diff --git a/source/row_common.cc b/source/row_common.cc index d2f17ef30..32e2db95a 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -454,73 +454,10 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { } #define BLENDER(f, b, a) (((256 - a) * b) >> 8) + f -void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) { - for (int x = 0; x < width - 1; x += 2) { - uint32 a = src_argb[3]; - if (a) { - if (a < 255) { - const uint32 fb = src_argb[0]; - const uint32 fg = src_argb[1]; - const uint32 fr = src_argb[2]; - const uint32 bb = dst_argb[0]; - const uint32 bg = dst_argb[1]; - const uint32 br = dst_argb[2]; - dst_argb[0] = BLENDER(fb, bb, a); - dst_argb[1] = BLENDER(fg, bg, a); - dst_argb[2] = BLENDER(fr, br, a); - dst_argb[3] = 255u; - } else { - *reinterpret_cast<uint32*>(dst_argb) = - *reinterpret_cast<const uint32*>(src_argb); - } - } - a = src_argb[4 + 3]; - if (a) { - if (a < 255) { - const uint32 fb = src_argb[4 + 0]; - const uint32 fg = src_argb[4 + 1]; - const uint32 fr = src_argb[4 + 2]; - const uint32 bb = dst_argb[4 + 0]; - const uint32 bg = dst_argb[4 + 1]; - const uint32 br = dst_argb[4 + 2]; - dst_argb[4 + 0] = BLENDER(fb, bb, a); - dst_argb[4 + 1] = BLENDER(fg, bg, a); - dst_argb[4 + 2] = BLENDER(fr, br, a); - dst_argb[4 + 3] = 255u; - } else { - *reinterpret_cast<uint32*>(dst_argb + 4) = - *reinterpret_cast<const uint32*>(src_argb + 4); - } - } - src_argb += 8; - dst_argb += 8; - } - - if (width & 1) { - const uint32 a = src_argb[3]; - if (a) { - if (a < 255) { - const uint32 fb = src_argb[0]; - const uint32 fg = src_argb[1]; - const uint32 fr = src_argb[2]; - const uint32 bb = dst_argb[0]; - const uint32 bg = dst_argb[1]; - const uint32 br = dst_argb[2]; - dst_argb[0] = BLENDER(fb, bb, a); - dst_argb[1] = BLENDER(fg, bg, a); - dst_argb[2] = BLENDER(fr, br, a); - dst_argb[3] = 255u; - } else { - *reinterpret_cast<uint32*>(dst_argb) = - *reinterpret_cast<const uint32*>(src_argb); - } - } - } -} // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. -void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { for (int x = 0; x < width - 1; x += 2) { uint32 a = src_argb0[3]; diff --git a/source/row_neon.cc b/source/row_neon.cc index 2c68492e3..ba22c8073 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -16,7 +16,7 @@ extern "C" { #endif // This module is for GCC Neon -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define YUVTORGB \ "vld1.u8 {d0}, [%0]! \n" \ diff --git a/source/row_posix.cc b/source/row_posix.cc index e7cfb011a..f8979ace0 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -18,7 +18,7 @@ extern "C" { #endif // This module is for GCC x86 and x64 -#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) // GCC 4.2 on OSX has link error when passing static or const to inline. // TODO(fbarchard): Use static const when gcc 4.2 support is dropped. 
@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, #ifdef HAS_ARGBBLENDROW_SSE2 // Blend 8 pixels at a time // Destination aligned to 16 bytes, multiple of 4 pixels -void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - - // 8 pixel loop - "1: \n" - "movdqu (%0),%%xmm3 \n" // first 4 pixels - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqa (%1),%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqa (%1),%%xmm1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "movdqu 0x10(%0),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "jle 9f \n" - "movdqa %%xmm3,%%xmm0 \n" // next 4 pixels - "pxor %%xmm4,%%xmm3 \n" - "movdqa 0x10(%1),%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqa 0x10(%1),%%xmm1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "jg 1b \n" - "9: \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -// Blend 1 pixel at a time, unaligned -void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - - // 1 pixel loop - "1: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x1,%2 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. 
- if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb, dst_argb, count); - src_argb += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb, dst_argb, width); - } -} - -#endif // HAS_ARGBBLENDROW_SSE2 - - - - - - - - -#ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time -// Destination aligned to 16 bytes, multiple of 4 pixels -void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm7,%%xmm7 \n" @@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, } // Blend 1 pixel at a time, unaligned -void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( "pcmpeqb %%xmm7,%%xmm7 \n" @@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, ); } -void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, + uint8* dst_argb, int width) { // Do 1 to 3 pixels to get destination aligned. if ((uintptr_t)(dst_argb) & 15) { int count = width; if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { count = (-(intptr_t)(dst_argb) >> 2) & 3; } - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); src_argb0 += count * 4; src_argb1 += count * 4; dst_argb += count * 4; @@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, } // Do multiple of 4 pixels if (width & ~3) { - ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); + ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); } // Do remaining 1 to 3 pixels if (width & 3) { @@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, src_argb1 += (width & ~3) * 4; dst_argb += (width & ~3) * 4; width &= 3; - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); } } #endif // HAS_ARGBBLENDROW_SSE2 - - - - - - - - #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index ada7788c7..c7c553774 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -16,7 +16,7 @@ extern "C" { #endif // This module is for Visual C x86 -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -99,7 +99,7 @@ static const uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_y @@ -127,7 +127,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } } 
-__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_abgr @@ -148,7 +148,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_bgra @@ -169,7 +169,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_rgb24 @@ -208,7 +208,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { __asm { @@ -255,7 +255,7 @@ __asm { // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions -__declspec(naked) +__declspec(naked) __declspec(align(16)) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix) { __asm { @@ -306,7 +306,7 @@ __asm { } // 24 instructions -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix) { __asm { @@ -360,7 +360,7 @@ __asm { } // 18 instructions -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix) { __asm { @@ -399,7 +399,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -438,7 +438,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -477,7 +477,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -516,7 +516,7 @@ __asm { } // TODO(fbarchard): Improve sign extension/packing -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -558,7 +558,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -589,7 +589,7 @@ __asm { } // Convert 16 ARGB pixels (64 bytes) to 16 Y values -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -623,7 +623,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -657,7 +657,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -691,7 +691,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -725,7 +725,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* 
dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -759,7 +759,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -793,7 +793,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -859,7 +859,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -929,7 +929,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -995,7 +995,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1065,7 +1065,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1131,7 +1131,7 @@ __asm { } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1268,7 +1268,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; __asm packuswb xmm2, xmm2 /* R */ \ } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1308,7 +1308,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1348,7 +1348,7 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1388,7 +1388,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1428,7 +1428,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1468,7 +1468,7 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1508,7 +1508,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1575,7 +1575,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, #endif #ifdef HAS_YTOARGBROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width) { @@ -1628,7 +1628,7 @@ static const uvec8 kShuffleMirror = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; 
-__declspec(naked) +__declspec(naked) __declspec(align(16)) void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -1653,7 +1653,7 @@ __asm { #ifdef HAS_MIRRORROW_SSE2 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 // version can not. -__declspec(naked) +__declspec(naked) __declspec(align(16)) void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -1686,7 +1686,7 @@ static const uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; -__declspec(naked) +__declspec(naked) __declspec(align(16)) void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1717,7 +1717,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif #ifdef HAS_SPLITUV_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi @@ -1756,7 +1756,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #ifdef HAS_COPYROW_SSE2 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time -__declspec(naked) +__declspec(naked) __declspec(align(16)) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { __asm { mov eax, [esp + 4] // src @@ -1779,7 +1779,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_X86 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void CopyRow_X86(const uint8* src, uint8* dst, int count) { __asm { mov eax, esi @@ -1797,7 +1797,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) { #endif #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -1823,7 +1823,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -1867,7 +1867,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -1893,7 +1893,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -1937,7 +1937,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -1961,7 +1961,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -2005,7 +2005,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -2029,7 +2029,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, } } -__declspec(naked) +__declspec(naked) __declspec(align(16)) void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int 
stride_uyvy, uint8* dst_u, uint8* dst_y, int pix) { __asm { @@ -2078,273 +2078,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, // Blend 8 pixels at a time // Destination aligned to 16 bytes, multiple of 4 pixels __declspec(naked) __declspec(align(16)) -void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - align 16 - convertloop: - movdqu xmm3, [eax] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [edx] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx], xmm0 - jle done - - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [edx + 16] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx + 16] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - jg convertloop - - done: - ret - } -} - -// Blend 1 pixel at a time, unaligned -__declspec(naked) __declspec(align(16)) -void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - align 16 - convertloop: - movd xmm3, [eax] - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [edx] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3,0F5h // 8 alpha words - pshuflw xmm3, xmm3,0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [edx] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 1 - movd [edx], xmm0 - lea edx, [edx + 4] - jg convertloop - ret - } -} - -void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. 
- if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb, dst_argb, count); - src_argb += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb, dst_argb, width); - } -} -#endif // HAS_ARGBBLENDROW_SSE2 - -#ifdef HAS_ARGBBLENDROW_SSSE3 -// Blend 8 pixels at a time -// Shuffle table for reversing the bytes. -static const uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; - -// Same as SSE2, but replaces -// psrlw xmm3, 8 // alpha -// pshufhw xmm3, xmm3,0F5h // 8 alpha words -// pshuflw xmm3, xmm3,0F5h -// with.. -// pshufb xmm3, kShuffleAlpha // alpha - -// Destination aligned to 16 bytes, multiple of 4 pixels -__declspec(naked) __declspec(align(16)) -void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - - align 16 - convertloop: - movdqu xmm3, [eax] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - pshufb xmm3, kShuffleAlpha // alpha - movdqa xmm2, [edx] // _r_b - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - movdqu xmm3, [eax + 16] - lea eax, [eax + 32] - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx], xmm0 - jle done - - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [edx + 16] // _r_b - pshufb xmm3, kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [edx + 16] // _a_g - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqa [edx + 16], xmm0 - lea edx, [edx + 32] - jg convertloop - - done: - ret - } -} - -void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - // Do 1 to 3 pixels to get destination aligned. 
- if ((uintptr_t)(dst_argb) & 15) { - int count = width; - if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { - count = (-(intptr_t)(dst_argb) >> 2) & 3; - } - ARGBBlendRow1_SSE2(src_argb, dst_argb, count); - src_argb += count * 4; - dst_argb += count * 4; - width -= count; - } - // Do multiple of 4 pixels - if (width & ~3) { - ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3); - } - // Do remaining 1 to 3 pixels - if (width & 3) { - src_argb += (width & ~3) * 4; - dst_argb += (width & ~3) * 4; - width &= 3; - ARGBBlendRow1_SSE2(src_argb, dst_argb, width); - } -} -#endif // HAS_ARGBBLENDROW_SSSE3 - - - - - - - -/////////////////////////////////////// -///////////////////// 2 source versions -/////////////////////////////////////// - - - - - - - -#ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time -// Destination aligned to 16 bytes, multiple of 4 pixels -__declspec(naked) __declspec(align(16)) -void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { push esi @@ -2418,7 +2152,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixel at a time, unaligned __declspec(naked) __declspec(align(16)) -void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { push esi @@ -2467,7 +2201,7 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1, } } -void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { // Do 1 to 3 pixels to get destination aligned. if ((uintptr_t)(dst_argb) & 15) { @@ -2475,7 +2209,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { count = (-(intptr_t)(dst_argb) >> 2) & 3; } - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); src_argb0 += count * 4; src_argb1 += count * 4; dst_argb += count * 4; @@ -2483,7 +2217,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, } // Do multiple of 4 pixels if (width & ~3) { - ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); + ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3); } // Do remaining 1 to 3 pixels if (width & 3) { @@ -2491,12 +2225,18 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, src_argb1 += (width & ~3) * 4; dst_argb += (width & ~3) * 4; width &= 3; - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); } } #endif // HAS_ARGBBLENDROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSSE3 +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleAlpha = { + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 +}; + // Blend 8 pixels at a time // Shuffle table for reversing the bytes. 
@@ -2509,7 +2249,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1, // Destination aligned to 16 bytes, multiple of 4 pixels __declspec(naked) __declspec(align(16)) -void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { push esi @@ -2577,7 +2317,7 @@ void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } } -void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, +void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { // Do 1 to 3 pixels to get destination aligned. if ((uintptr_t)(dst_argb) & 15) { @@ -2585,7 +2325,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) { count = (-(intptr_t)(dst_argb) >> 2) & 3; } - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count); src_argb0 += count * 4; src_argb1 += count * 4; dst_argb += count * 4; @@ -2593,7 +2333,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } // Do multiple of 4 pixels if (width & ~3) { - ARGBBlend2Row_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3); + ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3); } // Do remaining 1 to 3 pixels if (width & 3) { @@ -2601,7 +2341,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1, src_argb1 += (width & ~3) * 4; dst_argb += (width & ~3) * 4; width &= 3; - ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width); + ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width); } } #endif // HAS_ARGBBLENDROW_SSSE3 diff --git a/source/scale.cc b/source/scale.cc index bd9127095..60b39a519 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) { * */ -#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) #define HAS_SCALEROWDOWN2_NEON void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, uint8* dst, int dst_width) { @@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr, */ // Constants for SSE2 code -#elif defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) && \ - !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && \ + (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) + #if defined(_MSC_VER) #define TALIGN16(t, var) __declspec(align(16)) t _ ## var -#elif defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__) && \ - defined(__i386__) +#elif defined(__i386__) && \ + (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) #define TALIGN16(t, var) t var __attribute__((aligned(16))) #else #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) @@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) = { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; #endif -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SCALEROWDOWN2_SSE2 // Reads 32 pixels, throws half away and writes 16 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
-__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, } // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN4_SSE2 // Point samples 32 pixels to 8 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, // Blends 32x4 rectangle to 8x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN8_SSE2 // Point samples 32 pixels to 4 pixels. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, // Blends 32x8 rectangle to 4x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
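The hunks above only add __declspec(align(16)) next to __declspec(naked) on the ScaleRowDown2/4/8 kernels, but the surrounding comments ("reads 32 pixels, throws half away", "blends 32x2 rectangle to 16x1") describe the two downscale strategies the assembly implements: point sampling and box filtering. A rough scalar sketch of the /2 pair, written as an illustration rather than copied from libyuv's own C fallbacks (in particular, which pixel of each source pair the point sampler keeps is an assumption here):

#include <stdint.h>

// Point-sample /2: keep one pixel of every horizontal pair and ignore the
// second source row. The stride parameter is kept only to mirror the row
// function signature.
static void ScaleRowDown2_Sketch(const uint8_t* src_ptr, int src_stride,
                                 uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2];
  }
}

// Box-filter /2: average each 2x2 block with rounding, which is what
// "blends 32x2 rectangle to 16x1" amounts to per output pixel.
static void ScaleRowDown2Int_Sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  const uint8_t* next = src_ptr + src_stride;  // second row of the block
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x * 2] + src_ptr[x * 2 + 1] +
                            next[x * 2] + next[x * 2 + 1] + 2) >> 2);
  }
}

The /4 and /8 kernels that follow are the same two ideas applied to 4x4 and 8x8 blocks; the alignment requirements noted in the comments presumably come from the aligned movdqa loads in the assembly.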
-__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, #define HAS_SCALEADDROWS_SSE2 // Reads 16xN bytes and produces 16 shorts at a time. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, uint16* dst_ptr, int src_width, int src_height) { @@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. #define HAS_SCALEFILTERROWS_SSE2 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { @@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. #define HAS_SCALEFILTERROWS_SSSE3 -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { @@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) +__declspec(naked) __declspec(align(16)) static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width) { __asm { @@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } } -#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM) +#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) // GCC versions of row functions are verbatim conversions from Visual C. 
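ScaleFilterRows_SSE2 and ScaleFilterRows_SSSE3 in this chunk (again only gaining the align(16) attribute) combine two source rows into one, 16 pixels at a time, weighted by source_y_fraction. A hedged scalar sketch of that vertical blend, an illustration of mine rather than libyuv's C path (the exact rounding and the handling of fraction 0 and of trailing pixels are assumptions):

#include <stdint.h>

// Bilinear row filter: blend a row with the row src_stride bytes below it.
// source_y_fraction is an 8-bit weight, 0 meaning "all upper row" and
// values toward 256 weighting the lower row more heavily.
static void ScaleFilterRows_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                   int src_stride, int dst_width,
                                   int source_y_fraction) {
  int y1 = source_y_fraction;  // weight of the lower row
  int y0 = 256 - y1;           // weight of the upper row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (uint8_t)((src_ptr[x] * y0 + src_ptr[x + src_stride] * y1) >> 8);
  }
}

The SSSE3 variant presumably exists because pmaddubsw can apply both weights in one packed multiply-add, while the SSE2 version has to unpack to 16 bits first; the patch itself only touches the declspec line on each.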
// Generated using gcc disassembly on Visual C object file: @@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, ); } -#if defined(__i386__) +#if !defined(YUV_DISABLE_ASM) && defined(__i386__) extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( @@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, "ret \n" ); -#elif defined(__x86_64__) +#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__) static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { asm volatile (
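This last chunk applies the same guard tightening to the GCC builds: the out-of-line asm() blocks for 32-bit x86 and the inline-asm x86_64 versions of ScaleRowDown8Int_SSE2 and ScaleFilterRows_SSSE3 are now also skipped when YUV_DISABLE_ASM is defined, matching the MSVC section above. For orientation only, here is a behavioral sketch of what an 8x box-filter row such as ScaleRowDown8Int computes ("blends 32x8 rectangle to 4x1" in the earlier comment); the function name, loop structure and rounding are mine, and the real code does this with packed adds in assembly or via the project's own C fallback rather than a naive 8x8 sum.

#include <stdint.h>

// Average each 8x8 block of source bytes down to one destination byte,
// with rounding. dst_width is the number of output pixels.
static void ScaleRowDown8Box_Sketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int row = 0; row < 8; ++row) {
      for (int col = 0; col < 8; ++col) {
        sum += src_ptr[row * src_stride + x * 8 + col];
      }
    }
    dst_ptr[x] = (uint8_t)((sum + 32) >> 6);  // divide by 64 with rounding
  }
}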