diff --git a/README.chromium b/README.chromium
index 2396a47a7..348602be8 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 69
+Version: 77
 License: BSD
 License File: LICENSE
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index ef4ad844f..f84b4cacb 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1149,11 +1149,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
       (width % 2 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1167,8 +1162,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
       src_v += src_stride_v;
     }
   }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
   return 0;
 }
@@ -1201,11 +1194,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
       (width % 2 == 0)) {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOBGRAROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_MMX;
-  } else
 #endif
   {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
@@ -1219,7 +1207,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
       src_v += src_stride_v;
     }
   }
-  EMMS();
   return 0;
 }
@@ -1252,11 +1239,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
       (width % 2 == 0)) {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOABGRROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_MMX;
-  } else
 #endif
   {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
@@ -1270,7 +1252,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
       src_v += src_stride_v;
     }
   }
-  EMMS();
   return 0;
 }
@@ -1303,11 +1284,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
       (width % 2 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1319,8 +1295,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
     src_u += src_stride_u;
     src_v += src_stride_v;
   }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
   return 0;
 }
@@ -1353,13 +1327,9 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
   } else
 #endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-  FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_MMX;
-#else
   {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
   }
-#endif
   for (int y = 0; y < height; ++y) {
     FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
@@ -1367,8 +1337,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
     src_u += src_stride_u;
     src_v += src_stride_v;
   }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
   return 0;
 }
@@ -1391,11 +1359,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
-  if (width % 2 == 0) {
-    FastConvertYToARGBRow = FastConvertYToARGBRow_MMX;
-  } else
 #endif
   {
     FastConvertYToARGBRow = FastConvertYToARGBRow_C;
@@ -1405,8 +1368,6 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
   }
-  // MMX used for FastConvertYUVToARGBRow requires an emms instruction.
-  EMMS();
   return 0;
 }
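With the MMX rows gone, every converter in planar_functions.cc is left with a two-way dispatch: the SSE2 row when the width is even (and, for I400, the destination is 16-byte aligned), otherwise the portable C row, and the per-image EMMS() teardown disappears. A minimal sketch of that pattern; RowFn and PickRow are illustrative names, not libyuv symbols:

#include <cstdint>

typedef void (*RowFn)(const uint8_t* y_buf, const uint8_t* u_buf,
                      const uint8_t* v_buf, uint8_t* rgb_buf, int width);

RowFn PickRow(bool has_sse2, int width, RowFn sse2_row, RowFn c_row) {
  // The SSE2 rows emit two pixels per iteration, hence the even-width test.
  if (has_sse2 && (width % 2 == 0)) {
    return sse2_row;
  }
  // No EMMS() is needed after the image loop any more: only the MMX rows
  // aliased the x87 register file; XMM registers do not.
  return c_row;
}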
diff --git a/source/rotate.cc b/source/rotate.cc
index d85a8cf76..579d2b0ee 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -13,21 +13,19 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "rotate_priv.h"
+#include "row.h"
 
 namespace libyuv {
 
-#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
-    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
-#else
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
-#endif
+#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
+    !defined(__APPLE__) && \
+    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+
 // Shuffle table for reversing the bytes.
-extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
+static const uvec8 kShuffleReverse =
   { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
 // Shuffle table for reversing the bytes of UV channels.
-extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
+static const uvec8 kShuffleReverseUV =
   { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
 #endif
@@ -73,7 +71,7 @@ __asm {
     mov       edx, [esp + 12 + 12]  // dst
     mov       esi, [esp + 12 + 16]  // dst_stride
     mov       ecx, [esp + 12 + 20]  // width
-  convertloop :
+  convertloop:
     // Read in the data from the source pointer.
     // First round of bit swap.
     movq      xmm0, qword ptr [eax]
@@ -172,7 +170,7 @@ __asm {
     and       esp, ~15
     mov       [esp + 16], ecx
     mov       ecx, [ecx + 16 + 28]  // w
-  convertloop :
+  convertloop:
     // Read in the data from the source pointer.
    // First round of bit swap.
     movdqa    xmm0, [eax]
@@ -863,9 +861,9 @@ __asm {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    movdqa    xmm5, _kShuffleReverse
+    movdqa    xmm5, kShuffleReverse
     lea       eax, [eax + ecx - 16]
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     lea       eax, [eax - 16]
     pshufb    xmm0, xmm5
@@ -878,12 +876,16 @@ __asm {
 }
 
 #elif (defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__APPLE__) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_REVERSE_LINE_SSSE3
 static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
-  "movdqa     (%3),%%xmm5                    \n"
+  "movdqa     %0,%%xmm5                      \n"
+  :: "m"(kShuffleReverse)
+  );
+  asm volatile (
   "lea        -0x10(%0,%2,1),%0              \n"
 "1:                                          \n"
   "movdqa     (%0),%%xmm0                    \n"
@@ -896,12 +898,12 @@ static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(temp_width)  // %2
-  : "r"(kShuffleReverse)  // %3
+  :
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm5"
 #endif
-);
+  );
 }
 #endif
@@ -1066,10 +1068,10 @@ __asm {
     mov       edx, [esp + 4 + 8]   // dst_a
     mov       edi, [esp + 4 + 12]  // dst_b
     mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm5, _kShuffleReverseUV
+    movdqa    xmm5, kShuffleReverseUV
     lea       eax, [eax + ecx * 2 - 16]
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     lea       eax, [eax - 16]
     pshufb    xmm0, xmm5
@@ -1085,6 +1087,7 @@ __asm {
 }
 
 #elif (defined(__i386__) || defined(__x86_64__)) && \
+    !defined(__APPLE__) && \
     !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_REVERSE_LINE_UV_SSSE3
 void ReverseLineUV_SSSE3(const uint8* src,
@@ -1092,28 +1095,31 @@ void ReverseLineUV_SSSE3(const uint8* src,
                          int width) {
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
-  "movdqa     (%4),%%xmm5                    \n"
-  "lea        -0x10(%0,%3,2),%0              \n"
+  "movdqa     %0,%%xmm5                      \n"
+  :: "m"(kShuffleReverseUV)
+  );
+  asm volatile (
+  "lea        -16(%0,%3,2),%0                \n"
 "1:                                          \n"
   "movdqa     (%0),%%xmm0                    \n"
-  "lea        -0x10(%0),%0                   \n"
+  "lea        -16(%0),%0                     \n"
   "pshufb     %%xmm5,%%xmm0                  \n"
   "movlpd     %%xmm0,(%1)                    \n"
-  "lea        0x8(%1),%1                     \n"
+  "lea        8(%1),%1                       \n"
   "movhpd     %%xmm0,(%2)                    \n"
-  "lea        0x8(%2),%2                     \n"
-  "sub        $0x8,%3                        \n"
+  "lea        8(%2),%2                       \n"
+  "sub        $8,%3                          \n"
   "ja         1b                             \n"
   : "+r"(src),         // %0
     "+r"(dst_a),       // %1
     "+r"(dst_b),       // %2
     "+r"(temp_width)   // %3
-  : "r"(kShuffleReverseUV)  // %4
+  :
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm5"
 #endif
-);
+  );
 }
 #endif
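In rotate.cc the shuffle tables become file-local uvec8 constants: the win32 path drops the leading-underscore name (_kShuffleReverse) that the old TALIGN16 macro produced, the GCC path loads the table through an "m" constraint in a separate asm statement instead of spending a general register on its address, and the whole SSSE3 block is now compiled out on __APPLE__. What the descending mask computes is a plain byte reversal; a scalar model of pshufb's documented semantics, with an illustrative copy of the table:

#include <cstdint>

static const uint8_t kShuffleReverseModel[16] =
    { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };

// pshufb, scalar: each mask byte indexes the 16-byte source; a mask byte
// with its high bit set zeroes the lane (unused by these tables). With the
// descending mask above, dst is src reversed byte-for-byte.
static void PshufbModel(const uint8_t src[16], const uint8_t mask[16],
                        uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}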
 typedef unsigned char __attribute__((vector_size(16))) uvec8;
+typedef signed short __attribute__((vector_size(16))) vec16;
 #endif
 
 extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
@@ -204,36 +197,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 int width);
 #endif
 
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
-void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-
-void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-
-void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-
-void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-
-void FastConvertYToARGBRow_MMX(const uint8* y_buf,
-                               uint8* rgb_buf,
-                               int width);
-#endif
-
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width);
@@ -268,42 +231,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 int width);
 #endif
 
-// Method to force C version.
-//#define USE_MMX 0
-//#define USE_SSE2 0
-
-#if !defined(USE_MMX)
-// Windows, Mac and Linux use MMX
-#if defined(__i386__) || defined(_MSC_VER)
-#define USE_MMX 1
-#else
-#define USE_MMX 0
-#endif
-#endif
-
-#if !defined(USE_SSE2)
-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
-#define USE_SSE2 1
-#else
-#define USE_SSE2 0
-#endif
-#endif
-
-// x64 uses MMX2 (SSE) so emms is not required.
-// Warning C4799: function has no EMMS instruction.
-// EMMS() is slow and should be called by the calling function once per image.
-#if USE_MMX && !defined(ARCH_CPU_X86_64)
-#if defined(_MSC_VER)
-#define EMMS() __asm emms
-#pragma warning(disable: 4799)
-#else
-#define EMMS() asm("emms")
-#endif
-#else
-#define EMMS()
-#endif
-
-
 } // extern "C"
 
 #endif  // LIBYUV_SOURCE_ROW_H_
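In row.h the TALIGN16 macro gives way to genuine 16-byte vector types, so constant tables can be ordinary static const globals with the same declaration on both compilers. A self-contained sketch of the dual definition; the static_asserts are illustrative checks, not part of the patch:

#if defined(_MSC_VER)
typedef __declspec(align(16)) unsigned char uvec8[16];
#else
typedef unsigned char __attribute__((vector_size(16))) uvec8;
#endif

// Either form is one SSE register wide and safe for movdqa:
static_assert(sizeof(uvec8) == 16, "16 bytes wide");
static_assert(alignof(uvec8) == 16, "16-byte aligned");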
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 2bb7575a0..0b68a920d 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -542,231 +542,6 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
 #endif
   );
 }
-
-#endif
-
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
-// 32 bit mmx gcc version
-
-#ifdef OSX
-#define UNDERSCORE "_"
-#else
-#define UNDERSCORE ""
-#endif
-
-void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-  asm(
-  ".text                                     \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToARGBRow_MMX       \n"
-"_FastConvertYUVToARGBRow_MMX:               \n"
-#else
-  ".global FastConvertYUVToARGBRow_MMX       \n"
-"FastConvertYUVToARGBRow_MMX:                \n"
-#endif
-  "pusha                                     \n"
-  "mov    0x24(%esp),%edx                    \n"
-  "mov    0x28(%esp),%edi                    \n"
-  "mov    0x2c(%esp),%esi                    \n"
-  "mov    0x30(%esp),%ebp                    \n"
-  "mov    0x34(%esp),%ecx                    \n"
-
-"1:                                          \n"
-  "movzbl (%edi),%eax                        \n"
-  "lea    1(%edi),%edi                       \n"
-  "movzbl (%esi),%ebx                        \n"
-  "lea    1(%esi),%esi                       \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                        \n"
-  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx                     \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx                       \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1                          \n"
-  "paddsw %mm0,%mm2                          \n"
-  "psraw  $0x6,%mm1                          \n"
-  "psraw  $0x6,%mm2                          \n"
-  "packuswb %mm2,%mm1                        \n"
-  "movq   %mm1,0x0(%ebp)                     \n"
-  "lea    8(%ebp),%ebp                       \n"
-  "sub    $0x2,%ecx                          \n"
-  "ja     1b                                 \n"
-  "popa                                      \n"
-  "ret                                       \n"
-);
-
-void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-  asm(
-  ".text                                     \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToBGRARow_MMX       \n"
-"_FastConvertYUVToBGRARow_MMX:               \n"
-#else
-  ".global FastConvertYUVToBGRARow_MMX       \n"
-"FastConvertYUVToBGRARow_MMX:                \n"
-#endif
-  "pusha                                     \n"
-  "mov    0x24(%esp),%edx                    \n"
-  "mov    0x28(%esp),%edi                    \n"
-  "mov    0x2c(%esp),%esi                    \n"
-  "mov    0x30(%esp),%ebp                    \n"
-  "mov    0x34(%esp),%ecx                    \n"
-
-"1:                                          \n"
-  "movzbl (%edi),%eax                        \n"
-  "lea    1(%edi),%edi                       \n"
-  "movzbl (%esi),%ebx                        \n"
-  "lea    1(%esi),%esi                       \n"
-  "movq   " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                        \n"
-  "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx                     \n"
-  "movq   " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx                       \n"
-  "movq   " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1                          \n"
-  "paddsw %mm0,%mm2                          \n"
-  "psraw  $0x6,%mm1                          \n"
-  "psraw  $0x6,%mm2                          \n"
-  "packuswb %mm2,%mm1                        \n"
-  "movq   %mm1,0x0(%ebp)                     \n"
-  "lea    8(%ebp),%ebp                       \n"
-  "sub    $0x2,%ecx                          \n"
-  "ja     1b                                 \n"
-  "popa                                      \n"
-  "ret                                       \n"
-);
-
-void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width);
-  asm(
-  ".text                                     \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUVToABGRRow_MMX       \n"
-"_FastConvertYUVToABGRRow_MMX:               \n"
-#else
-  ".global FastConvertYUVToABGRRow_MMX       \n"
-"FastConvertYUVToABGRRow_MMX:                \n"
-#endif
-  "pusha                                     \n"
-  "mov    0x24(%esp),%edx                    \n"
-  "mov    0x28(%esp),%edi                    \n"
-  "mov    0x2c(%esp),%esi                    \n"
-  "mov    0x30(%esp),%ebp                    \n"
-  "mov    0x34(%esp),%ecx                    \n"
-
-"1:                                          \n"
-  "movzbl (%edi),%eax                        \n"
-  "lea    1(%edi),%edi                       \n"
-  "movzbl (%esi),%ebx                        \n"
-  "lea    1(%esi),%esi                       \n"
-  "movq   " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                        \n"
-  "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
-  "movzbl 0x1(%edx),%ebx                     \n"
-  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
-  "lea    2(%edx),%edx                       \n"
-  "movq   " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
-  "paddsw %mm0,%mm1                          \n"
-  "paddsw %mm0,%mm2                          \n"
-  "psraw  $0x6,%mm1                          \n"
-  "psraw  $0x6,%mm2                          \n"
-  "packuswb %mm2,%mm1                        \n"
-  "movq   %mm1,0x0(%ebp)                     \n"
-  "lea    8(%ebp),%ebp                       \n"
-  "sub    $0x2,%ecx                          \n"
-  "ja     1b                                 \n"
-  "popa                                      \n"
-  "ret                                       \n"
-);
-
-void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width);
-  asm(
-  ".text                                     \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYUV444ToARGBRow_MMX    \n"
-"_FastConvertYUV444ToARGBRow_MMX:            \n"
-#else
-  ".global FastConvertYUV444ToARGBRow_MMX    \n"
-"FastConvertYUV444ToARGBRow_MMX:             \n"
-#endif
-  "pusha                                     \n"
-  "mov    0x24(%esp),%edx                    \n"
-  "mov    0x28(%esp),%edi                    \n"
-  "mov    0x2c(%esp),%esi                    \n"
-  "mov    0x30(%esp),%ebp                    \n"
-  "mov    0x34(%esp),%ecx                    \n"
-
-"1:                                          \n"
-  "movzbl (%edi),%eax                        \n"
-  "lea    1(%edi),%edi                       \n"
-  "movzbl (%esi),%ebx                        \n"
-  "lea    1(%esi),%esi                       \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
-  "movzbl (%edx),%eax                        \n"
-  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
-  "lea    1(%edx),%edx                       \n"
-  "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
-  "psraw  $0x6,%mm0                          \n"
-  "packuswb %mm0,%mm0                        \n"
-  "movd   %mm0,0x0(%ebp)                     \n"
-  "lea    4(%ebp),%ebp                       \n"
-  "sub    $0x1,%ecx                          \n"
-  "ja     1b                                 \n"
-  "popa                                      \n"
-  "ret                                       \n"
-);
-
-void FastConvertYToARGBRow_MMX(const uint8* y_buf,
-                               uint8* rgb_buf,
-                               int width);
-  asm(
-  ".text                                     \n"
-#if defined(OSX) || defined(IOS)
-  ".globl _FastConvertYToARGBRow_MMX         \n"
-"_FastConvertYToARGBRow_MMX:                 \n"
-#else
-  ".global FastConvertYToARGBRow_MMX         \n"
-"FastConvertYToARGBRow_MMX:                  \n"
-#endif
-  "push   %ebx                               \n"
-  "mov    0x8(%esp),%eax                     \n"
-  "mov    0xc(%esp),%edx                     \n"
-  "mov    0x10(%esp),%ecx                    \n"
-
-"1:                                          \n"
-  "movzbl (%eax),%ebx                        \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
-  "psraw  $0x6,%mm0                          \n"
-  "movzbl 0x1(%eax),%ebx                     \n"
-  "movq   " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
-  "psraw  $0x6,%mm1                          \n"
-  "packuswb %mm1,%mm0                        \n"
-  "lea    0x2(%eax),%eax                     \n"
-  "movq   %mm0,(%edx)                        \n"
-  "lea    0x8(%edx),%edx                     \n"
-  "sub    $0x2,%ecx                          \n"
-  "ja     1b                                 \n"
-  "pop    %ebx                               \n"
-  "ret                                       \n"
-);
-
 #endif
 
 #ifdef HAS_ARGBTOYROW_SSSE3
diff --git a/source/row_win.cc b/source/row_win.cc
index a0b1cc594..bd8c33cce 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -15,71 +15,71 @@ extern "C" {
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constant multiplication table for converting ARGB to I400.
-SIMD_ALIGNED(const int8 kARGBToY[16]) = {
+static const vec8 kARGBToY = {
   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 
-SIMD_ALIGNED(const int8 kARGBToU[16]) = {
+static const vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
 
-SIMD_ALIGNED(const int8 kARGBToV[16]) = {
+static const vec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
 // Constants for BGRA
-SIMD_ALIGNED(const int8 kBGRAToY[16]) = {
+static const vec8 kBGRAToY = {
   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
 };
 
-SIMD_ALIGNED(const int8 kBGRAToU[16]) = {
+static const vec8 kBGRAToU = {
   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
 };
 
-SIMD_ALIGNED(const int8 kBGRAToV[16]) = {
+static const vec8 kBGRAToV = {
   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
 };
 
 // Constants for ABGR
-SIMD_ALIGNED(const int8 kABGRToY[16]) = {
+static const vec8 kABGRToY = {
   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
 };
 
-SIMD_ALIGNED(const int8 kABGRToU[16]) = {
+static const vec8 kABGRToU = {
   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
 };
 
-SIMD_ALIGNED(const int8 kABGRToV[16]) = {
+static const vec8 kABGRToV = {
   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
 };
 
-SIMD_ALIGNED(const uint8 kAddY16[16]) = {
+static const uvec8 kAddY16 = {
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
 };
 
-SIMD_ALIGNED(const uint8 kAddUV128[16]) = {
+static const uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 
 // Shuffle table for converting BG24 to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskBG24ToARGB[16]) = {
+static const uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 
 // Shuffle table for converting RAW to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskRAWToARGB[16]) = {
+static const uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 
 // Shuffle table for converting ABGR to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskABGRToARGB[16]) = {
+static const uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 
 // Shuffle table for converting BGRA to ARGB.
-SIMD_ALIGNED(const uint8 kShuffleMaskBGRAToARGB[16]) = {
+static const uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
@@ -118,7 +118,7 @@ __asm {
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm5, kShuffleMaskABGRToARGB
 
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     lea       eax, [eax + 16]
     pshufb    xmm0, xmm5
@@ -138,7 +138,7 @@ __asm {
     mov       ecx, [esp + 12]  // pix
     movdqa    xmm5, kShuffleMaskBGRAToARGB
 
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     lea       eax, [eax + 16]
     pshufb    xmm0, xmm5
@@ -160,7 +160,7 @@ __asm {
     pslld     xmm5, 24
     movdqa    xmm4, kShuffleMaskBG24ToARGB
 
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
     movdqa    xmm3, [eax + 32]
@@ -199,7 +199,7 @@ __asm {
     pslld     xmm5, 24
     movdqa    xmm4, kShuffleMaskRAWToARGB
 
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
     movdqa    xmm3, [eax + 32]
@@ -237,7 +237,7 @@ __asm {
     movdqa    xmm5, kAddY16
     movdqa    xmm4, kARGBToY
 
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
     movdqa    xmm2, [eax + 32]
@@ -270,7 +270,7 @@ __asm {
     movdqa    xmm5, kAddY16
     movdqa    xmm4, kBGRAToY
 
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
     movdqa    xmm2, [eax + 32]
@@ -303,7 +303,7 @@ __asm {
     movdqa    xmm5, kAddY16
     movdqa    xmm4, kABGRToY
 
-  convertloop :
+  convertloop:
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
     movdqa    xmm2, [eax + 32]
@@ -343,7 +343,7 @@ __asm {
     movdqa    xmm5, kAddUV128
     sub       edi, edx         // stride from u to v
 
-  convertloop :
+  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
@@ -407,7 +407,7 @@ __asm {
     movdqa    xmm5, kAddUV128
     sub       edi, edx         // stride from u to v
 
-  convertloop :
+  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
@@ -471,7 +471,7 @@ __asm {
     movdqa    xmm5, kAddUV128
     sub       edi, edx         // stride from u to v
 
-  convertloop :
+  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqa    xmm0, [eax]
     movdqa    xmm1, [eax + 16]
@@ -519,182 +519,6 @@ __asm {
   }
 }
 
-#define YUVTORGB_MMX(TABLE) __asm {                                    \
-    __asm convertloop :                                                \
-    __asm movzx     eax, byte ptr [edi]                                \
-    __asm lea       edi, [edi + 1]                                     \
-    __asm movzx     ebx, byte ptr [esi]                                \
-    __asm lea       esi, [esi + 1]                                     \
-    __asm movq      mm0, [TABLE + 2048 + 8 * eax]                      \
-    __asm movzx     eax, byte ptr [edx]                                \
-    __asm paddsw    mm0, [TABLE + 4096 + 8 * ebx]                      \
-    __asm movzx     ebx, byte ptr [edx + 1]                            \
-    __asm movq      mm1, [TABLE + 8 * eax]                             \
-    __asm lea       edx, [edx + 2]                                     \
-    __asm movq      mm2, [TABLE + 8 * ebx]                             \
-    __asm paddsw    mm1, mm0                                           \
-    __asm paddsw    mm2, mm0                                           \
-    __asm psraw     mm1, 6                                             \
-    __asm psraw     mm2, 6                                             \
-    __asm packuswb  mm1, mm2                                           \
-    __asm movq      [ebp], mm1                                         \
-    __asm lea       ebp, [ebp + 8]                                     \
-    __asm sub       ecx, 2                                             \
-    __asm ja        convertloop                                        \
-  }
-
-__declspec(naked)
-void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width) {
-  __asm {
-    push      ebx
-    push      esi
-    push      edi
-    push      ebp
-    mov       edx, [esp + 16 + 4]
-    mov       edi, [esp + 16 + 8]
-    mov       esi, [esp + 16 + 12]
-    mov       ebp, [esp + 16 + 16]
-    mov       ecx, [esp + 16 + 20]
-
-    YUVTORGB_MMX(kCoefficientsRgbY)
-
-    pop       ebp
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-  }
-}
-
-__declspec(naked)
-void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width) {
-  __asm {
-    push      ebx
-    push      esi
-    push      edi
-    push      ebp
-    mov       edx, [esp + 16 + 4]
-    mov       edi, [esp + 16 + 8]
-    mov       esi, [esp + 16 + 12]
-    mov       ebp, [esp + 16 + 16]
-    mov       ecx, [esp + 16 + 20]
-
-    YUVTORGB_MMX(kCoefficientsBgraY)
-
-    pop       ebp
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-  }
-}
-
-__declspec(naked)
-void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* rgb_buf,
-                                 int width) {
-  __asm {
-    push      ebx
-    push      esi
-    push      edi
-    push      ebp
-    mov       edx, [esp + 16 + 4]
-    mov       edi, [esp + 16 + 8]
-    mov       esi, [esp + 16 + 12]
-    mov       ebp, [esp + 16 + 16]
-    mov       ecx, [esp + 16 + 20]
-
-    YUVTORGB_MMX(kCoefficientsAbgrY)
-
-    pop       ebp
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-  }
-}
-
-__declspec(naked)
-void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
-                                    const uint8* u_buf,
-                                    const uint8* v_buf,
-                                    uint8* rgb_buf,
-                                    int width) {
-  __asm {
-    push      ebx
-    push      esi
-    push      edi
-    push      ebp
-    mov       edx, [esp + 16 + 4]
-    mov       edi, [esp + 16 + 8]
-    mov       esi, [esp + 16 + 12]
-    mov       ebp, [esp + 16 + 16]
-    mov       ecx, [esp + 16 + 20]
-
-  convertloop :
-    movzx     eax, byte ptr [edi]
-    lea       edi, [edi + 1]
-    movzx     ebx, byte ptr [esi]
-    lea       esi, [esi + 1]
-    movq      mm0, [kCoefficientsRgbY + 2048 + 8 * eax]
-    movzx     eax, byte ptr [edx]
-    paddsw    mm0, [kCoefficientsRgbY + 4096 + 8 * ebx]
-    lea       edx, [edx + 1]
-    paddsw    mm0, [kCoefficientsRgbY + 8 * eax]
-    psraw     mm0, 6
-    packuswb  mm0, mm0
-    movd      [ebp], mm0
-    lea       ebp, [ebp + 4]
-    sub       ecx, 1
-    ja        convertloop
-
-    pop       ebp
-    pop       edi
-    pop       esi
-    pop       ebx
-    ret
-  }
-}
-
-__declspec(naked)
-void FastConvertYToARGBRow_MMX(const uint8* y_buf,
-                               uint8* rgb_buf,
-                               int width) {
-  __asm {
-    push      ebx
-    mov       eax, [esp + 4 + 4]   // Y
-    mov       edx, [esp + 4 + 8]   // rgb
-    mov       ecx, [esp + 4 + 12]  // width
-
-  convertloop :
-    movzx     ebx, byte ptr [eax]
-    movq      mm0, [kCoefficientsRgbY + 8 * ebx]
-    psraw     mm0, 6
-    movzx     ebx, byte ptr [eax + 1]
-    movq      mm1, [kCoefficientsRgbY + 8 * ebx]
-    psraw     mm1, 6
-    packuswb  mm0, mm1
-    lea       eax, [eax + 2]
-    movq      [edx], mm0
-    lea       edx, [edx + 8]
-    sub       ecx, 2
-    ja        convertloop
-
-    pop       ebx
-    ret
-  }
-}
-
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
@@ -712,35 +536,35 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
 #define BG UG * 128 + VG * 128
 #define BR UR * 128 + VR * 128
 
-SIMD_ALIGNED(const int8 kUVToB[16]) = {
+static const vec8 kUVToB = {
   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
 };
 
-SIMD_ALIGNED(const int8 kUVToR[16]) = {
+static const vec8 kUVToR = {
   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
 };
 
-SIMD_ALIGNED(const int8 kUVToG[16]) = {
+static const vec8 kUVToG = {
   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };
 
-SIMD_ALIGNED(const int16 kYToRgb[8]) = {
+static const vec16 kYToRgb = {
   YG, YG, YG, YG, YG, YG, YG, YG
 };
 
-SIMD_ALIGNED(const int16 kYSub16[8]) = {
+static const vec16 kYSub16 = {
   16, 16, 16, 16, 16, 16, 16, 16
 };
 
-SIMD_ALIGNED(const int16 kUVBiasB[8]) = {
+static const vec16 kUVBiasB = {
   BB, BB, BB, BB, BB, BB, BB, BB
 };
 
-SIMD_ALIGNED(const int16 kUVBiasG[8]) = {
+static const vec16 kUVBiasG = {
   BG, BG, BG, BG, BG, BG, BG, BG
 };
 
-SIMD_ALIGNED(const int16 kUVBiasR[8]) = {
+static const vec16 kUVBiasR = {
   BR, BR, BR, BR, BR, BR, BR, BR
 };
@@ -794,7 +618,7 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
     pcmpeqb   xmm5, xmm5       // generate 0xffffffff for alpha
     pxor      xmm4, xmm4
 
-  convertloop :
+  convertloop:
     YUVTORGB_SSSE3
 
     // Step 3: Weave into ARGB
@@ -833,7 +657,7 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
     sub       edi, esi
     pxor      xmm4, xmm4
 
-  convertloop :
+  convertloop:
     YUVTORGB_SSSE3
 
     // Step 3: Weave into BGRA
@@ -874,7 +698,7 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
     pcmpeqb   xmm5, xmm5       // generate 0xffffffff for alpha
     pxor      xmm4, xmm4
 
-  convertloop :
+  convertloop:
     YUVTORGB_SSSE3
 
     // Step 3: Weave into ARGB
@@ -914,7 +738,7 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
     pcmpeqb   xmm5, xmm5       // generate 0xffffffff for alpha
     pxor      xmm4, xmm4
 
-  convertloop :
+  convertloop:
     // Step 1: Find 4 UV contributions to 4 R,G,B values
     movd      xmm0, [esi]        // U
     movd      xmm1, [esi + edi]  // V
@@ -978,7 +802,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
     movdqa    xmm3, kYSub16
     movdqa    xmm2, kYToRgb
 
-  convertloop :
+  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     movq      xmm0, qword ptr [eax]
     lea       eax, [eax + 8]
diff --git a/source/scale.cc b/source/scale.cc
index cad1c538c..b9e310562 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -14,6 +14,7 @@
 #include
 
 #include "libyuv/cpu_id.h"
+#include "row.h"
 
 #if defined(_MSC_VER)
 #define ALIGN16(var) __declspec(align(16)) var
@@ -21,6 +22,7 @@
 #define ALIGN16(var) var __attribute__((aligned(16)))
 #endif
 
+
 // Note: A Neon reference manual
 //  http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
 // Note: Some SSE2 reference manuals
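On the SSSE3 constants in row_win.cc: the surviving path is 6-bit fixed point. pmaddubsw against kUVToB/G/R forms u*Uw + v*Vw per channel, the kUVBias* values fold in the U/V zero point of 128 (because (u-128)*Uw + (v-128)*Vw = u*Uw + v*Vw - (Uw + Vw)*128, which is exactly BB/BG/BR), and kYSub16/kYToRgb compute (y-16)*YG with YG = 74 ~ 1.164*64. A scalar model of one channel; the weights other than YG are not spelled out in these hunks, so they are left as parameters here:

#include <algorithm>
#include <cstdint>

// One channel of the SSSE3 math; for blue pass (u_weight, v_weight) =
// (UB, VB), and so on. Each step mirrors the vector op named in the comment.
static uint8_t YuvChannelModel(uint8_t y, uint8_t u, uint8_t v,
                               int u_weight, int v_weight) {
  int bias = u_weight * 128 + v_weight * 128;   // kUVBias* (BB/BG/BR)
  int sum = u * u_weight + v * v_weight - bias  // pmaddubsw, psubw
          + (y - 16) * 74;                      // kYSub16, pmullw kYToRgb
  sum >>= 6;                                    // psraw $6
  return static_cast<uint8_t>(std::min(255, std::max(0, sum)));  // packuswb
}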