diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e7a58717d..b5cc8b36a 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -36,7 +36,7 @@ static void SplitUV_NEON(const uint8* src_uv,
       "+r"(dst_v), "+r"(pix)            // Output registers
     :                                   // Input registers
-    : "q0", "q1"                        // Clobber List
+    : "memory", "cc", "q0", "q1"        // Clobber List
   );
 }
 
@@ -1080,6 +1080,13 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
                                       const uint8* v_buf,
                                       uint8* rgb_buf,
                                       int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
+  } else
+#endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       (width % 4 == 0) &&
@@ -1132,6 +1139,13 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
                                       const uint8* v_buf,
                                       uint8* rgb_buf,
                                       int width);
+#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
+  } else
+#endif
 #if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       (width % 2 == 0)) {
@@ -1176,6 +1190,13 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
                                       const uint8* v_buf,
                                       uint8* rgb_buf,
                                       int width);
+#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
+  } else
+#endif
 #if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       (width % 2 == 0)) {
@@ -1220,6 +1241,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
                                       const uint8* v_buf,
                                       uint8* rgb_buf,
                                       int width);
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
+  } else
+#endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       (width % 2 == 0)) {
@@ -1263,6 +1291,13 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
                                       const uint8* v_buf,
                                       uint8* rgb_buf,
                                       int width);
+#if defined(HAS_FASTCONVERTYUV444TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
+    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
+  } else
+#endif
 #if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
@@ -1300,10 +1335,10 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
   void (*FastConvertYToARGBRow)(const uint8* y_buf,
                                 uint8* rgb_buf,
                                 int width);
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0) &&
-      IS_ALIGNED(dst_argb, 8) && (dst_stride_argb % 8 == 0)) {
+#if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      (width % 8 == 0) &&
+      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
   } else
 #endif
diff --git a/source/row.h b/source/row.h
index 958a833b1..35489ccc7 100644
--- a/source/row.h
+++ b/source/row.h
@@ -15,51 +15,61 @@
 #define kMaxStride (2048 * 4)
 
+#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
+#define YUV_DISABLE_ASM
+#endif
+
 // The following are available on all x86 platforms
 #if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+    !defined(YUV_DISABLE_ASM)
 #define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_BGRATOARGBROW_SSSE3
-#define HAS_ARGBTOYROW_SSSE3
 #define HAS_BG24TOARGBROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RGB24TOYROW_SSSE3
 #define HAS_RAWTOYROW_SSSE3
 #define HAS_RGB24TOUVROW_SSSE3
 #define HAS_RAWTOUVROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
-#define HAS_I400TOARGBROW_SSE2
-#endif
-
-// The following are available on Windows and Linux
-#if (defined(WIN32) || defined(__x86_64__) || \
-     (defined(__i386__) && !defined(__pic__))) && \
-    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
+#define HAS_I400TOARGBROW_SSE2
 #endif
 
 // The following are available on Linux (32/64 bit)
 // TODO(fbarchard): enable for fpic on linux
 #if (defined(__x86_64__) || \
     (defined(__i386__) && !defined(__pic__))) && \
-    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+    !defined(YUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_SSE2
 #define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
 #define HAS_FASTCONVERTYUVTOABGRROW_SSE2
+#define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
+#define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif
 
 // The following are available on Windows and GCC 32 bit
 #if (defined(WIN32) || \
     defined(__i386__)) && \
-    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
+    !defined(YUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_MMX
 #define HAS_FASTCONVERTYUVTOBGRAROW_MMX
 #define HAS_FASTCONVERTYUVTOABGRROW_MMX
 #endif
 
+// The following are available on Windows
+#if defined(WIN32) && \
+    !defined(YUV_DISABLE_ASM)
+#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
+#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
+#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
+#define HAS_FASTCONVERTYTOARGBROW_SSE2
+#endif
+
 extern "C" {
 
 #ifdef HAS_ARGBTOYROW_SSSE3
@@ -224,6 +234,40 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
                                int width);
 #endif
 
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* rgb_buf,
+                                   int width);
+
+void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* rgb_buf,
+                                   int width);
+
+void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* rgb_buf,
+                                   int width);
+
+void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
+                                      const uint8* u_buf,
+                                      const uint8* v_buf,
+                                      uint8* rgb_buf,
+                                      int width);
+
+#endif
+
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
+void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
+                                uint8* rgb_buf,
+                                int width);
+
+#endif
+
 // Method to force C version.
 //#define USE_MMX 0
 //#define USE_SSE2 0
diff --git a/source/row_posix.cc b/source/row_posix.cc
index ad6202e15..f355122f6 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -253,37 +253,47 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 #ifdef HAS_ARGBTOUVROW_SSSE3
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile(
-  "movdqa     %5,%%xmm7\n"
-  "movdqa     %6,%%xmm6\n"
-  "movdqa     %7,%%xmm5\n"
+  asm volatile(
+  "movdqa     %0,%%xmm4\n"
+  "movdqa     %1,%%xmm3\n"
+  "movdqa     %2,%%xmm5\n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  :
+#if defined(__SSE2__)
+    "xmm3", "xmm4", "xmm5"
+#endif
+  );
+  asm volatile(
  "sub        %1,%2\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     0x20(%0),%%xmm2\n"
- "movdqa     0x30(%0),%%xmm3\n"
+ "movdqa     0x30(%0),%%xmm6\n"
  "pavgb      (%0,%4,1),%%xmm0\n"
  "pavgb      0x10(%0,%4,1),%%xmm1\n"
  "pavgb      0x20(%0,%4,1),%%xmm2\n"
- "pavgb      0x30(%0,%4,1),%%xmm3\n"
+ "pavgb      0x30(%0,%4,1),%%xmm6\n"
  "lea        0x40(%0),%0\n"
- "movdqa     %%xmm0,%%xmm4\n"
+ "movdqa     %%xmm0,%%xmm7\n"
  "shufps     $0x88,%%xmm1,%%xmm0\n"
- "shufps     $0xdd,%%xmm1,%%xmm4\n"
- "pavgb      %%xmm4,%%xmm0\n"
- "movdqa     %%xmm2,%%xmm4\n"
- "shufps     $0x88,%%xmm3,%%xmm2\n"
- "shufps     $0xdd,%%xmm3,%%xmm4\n"
- "pavgb      %%xmm4,%%xmm2\n"
+ "shufps     $0xdd,%%xmm1,%%xmm7\n"
+ "pavgb      %%xmm7,%%xmm0\n"
+ "movdqa     %%xmm2,%%xmm7\n"
+ "shufps     $0x88,%%xmm6,%%xmm2\n"
+ "shufps     $0xdd,%%xmm6,%%xmm7\n"
+ "pavgb      %%xmm7,%%xmm2\n"
  "movdqa     %%xmm0,%%xmm1\n"
- "movdqa     %%xmm2,%%xmm3\n"
- "pmaddubsw  %%xmm7,%%xmm0\n"
- "pmaddubsw  %%xmm7,%%xmm2\n"
- "pmaddubsw  %%xmm6,%%xmm1\n"
- "pmaddubsw  %%xmm6,%%xmm3\n"
+ "movdqa     %%xmm2,%%xmm6\n"
+ "pmaddubsw  %%xmm4,%%xmm0\n"
+ "pmaddubsw  %%xmm4,%%xmm2\n"
+ "pmaddubsw  %%xmm3,%%xmm1\n"
+ "pmaddubsw  %%xmm3,%%xmm6\n"
  "phaddw     %%xmm2,%%xmm0\n"
- "phaddw     %%xmm3,%%xmm1\n"
+ "phaddw     %%xmm6,%%xmm1\n"
  "psraw      $0x8,%%xmm0\n"
  "psraw      $0x8,%%xmm1\n"
  "packsswb   %%xmm1,%%xmm0\n"
@@ -297,13 +307,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     "+r"(dst_u),   // %1
     "+r"(dst_v),   // %2
     "+rm"(width)   // %3
-  : "r"(static_cast<intptr_t>(src_stride_argb)),  // %4
-    "m"(kARGBToU), // %5
-    "m"(kARGBToV), // %6
-    "m"(kAddUV128) // %7
+  : "r"(static_cast<intptr_t>(src_stride_argb))
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
 #endif
   );
 }
diff --git a/source/row_table.cc b/source/row_table.cc
index d9c21d6dc..4d191ac2f 100644
--- a/source/row_table.cc
+++ b/source/row_table.cc
@@ -208,25 +208,27 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
   RGBV(0xFC), RGBV(0xFD), RGBV(0xFE), RGBV(0xFF), \
 };
 
+#define CS(v) static_cast<int16>(v)
+
 // ARGB table
 #define RGBY(i) { \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(256 * 64 - 1) \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(256 * 64 - 1) \
 }
 
 #define RGBU(i) { \
-  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
-  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
+  CS(2.018 * 64 * (i - 128) + 0.5), \
+  CS(-0.391 * 64 * (i - 128) - 0.5), \
   0, \
   0 \
 }
 
 #define RGBV(i) { \
   0, \
-  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
-  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
+  CS(-0.813 * 64 * (i - 128) - 0.5), \
+  CS(1.596 * 64 * (i - 128) + 0.5), \
   0 \
 }
 
@@ -238,23 +240,23 @@ MAKETABLE(kCoefficientsRgbY)
 
 // BGRA table
 #define RGBY(i) { \
-  static_cast<int16>(256 * 64 - 1), \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5) \
+  CS(256 * 64 - 1), \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(1.164 * 64 * (i - 16) + 0.5) \
 }
 
 #define RGBU(i) { \
   0, \
   0, \
-  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
-  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5) \
+  CS(-0.391 * 64 * (i - 128) - 0.5), \
+  CS(2.018 * 64 * (i - 128) + 0.5) \
 }
 
 #define RGBV(i) { \
   0, \
-  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
-  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+  CS(1.596 * 64 * (i - 128) + 0.5), \
+  CS(-0.813 * 64 * (i - 128) - 0.5), \
   0 \
 }
 
@@ -266,22 +268,22 @@ MAKETABLE(kCoefficientsBgraY)
 
 // ABGR table
 #define RGBY(i) { \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
-  static_cast<int16>(256 * 64 - 1) \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(1.164 * 64 * (i - 16) + 0.5), \
+  CS(256 * 64 - 1) \
 }
 
 #define RGBU(i) { \
   0, \
-  static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
-  static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
+  CS(-0.391 * 64 * (i - 128) - 0.5), \
+  CS(2.018 * 64 * (i - 128) + 0.5), \
   0 \
 }
 
 #define RGBV(i) { \
-  static_cast<int16>(1.596 * 64 * (i - 128) + 0.5), \
-  static_cast<int16>(-0.813 * 64 * (i - 128) + 0.5), \
+  CS(1.596 * 64 * (i - 128) + 0.5), \
+  CS(-0.813 * 64 * (i - 128) - 0.5), \
   0, \
   0 \
 }
diff --git a/source/row_win.cc b/source/row_win.cc
index 27d2d0b93..1723eddae 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -520,7 +520,7 @@ __asm {
   }
 }
 
-#define YUVTORGB(TABLE) __asm {                                      \
+#define YUVTORGB_MMX(TABLE) __asm {                                  \
    __asm convertloop :                                               \
    __asm movzx     eax, byte ptr [edi]                               \
    __asm lea       edi, [edi + 1]                                    \
@@ -561,7 +561,7 @@ void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
     mov       ebp, [esp + 16 + 16]
     mov       ecx, [esp + 16 + 20]
 
-    YUVTORGB(kCoefficientsRgbY)
+    YUVTORGB_MMX(kCoefficientsRgbY)
 
     pop       ebp
     pop       edi
@@ -588,7 +588,7 @@ void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
     mov       ebp, [esp + 16 + 16]
     mov       ecx, [esp + 16 + 20]
 
-    YUVTORGB(kCoefficientsBgraY)
+    YUVTORGB_MMX(kCoefficientsBgraY)
 
     pop       ebp
     pop       edi
@@ -615,7 +615,7 @@ void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
     mov       ebp, [esp + 16 + 16]
     mov       ecx, [esp + 16 + 20]
 
-    YUVTORGB(kCoefficientsAbgrY)
+    YUVTORGB_MMX(kCoefficientsAbgrY)
 
     pop       ebp
     pop       edi
@@ -696,6 +696,321 @@ void FastConvertYToARGBRow_MMX(const uint8* y_buf,
   }
 }
 
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+extern "C" TALIGN16(const int8, kUVToB[16]) = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+extern "C" TALIGN16(const int8, kUVToR[16]) = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+extern "C" TALIGN16(const int8, kUVToG[16]) = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+extern "C" TALIGN16(const int16, kYToRgb[8]) = {
+  YG, YG, YG, YG, YG, YG, YG, YG
+};
+
+extern "C" TALIGN16(const int16, kYSub16[8]) = {
+  16, 16, 16, 16, 16, 16, 16, 16
+};
+
+extern "C" TALIGN16(const int16, 
kUVBiasB[8]) = { + BB, BB, BB, BB, BB, BB, BB, BB +}; + +extern "C" TALIGN16(const int16, kUVBiasG[8]) = { + BG, BG, BG, BG, BG, BG, BG, BG +}; + +extern "C" TALIGN16(const int16, kUVBiasR[8]) = { + BR, BR, BR, BR, BR, BR, BR, BR +}; + +#define YUVTORGB_SSSE3 __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, _kUVToB /* scale B UV */ \ + __asm pmaddubsw xmm1, _kUVToG /* scale G UV */ \ + __asm pmaddubsw xmm2, _kUVToR /* scale R UV */ \ + __asm psubw xmm0, _kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, _kUVBiasG \ + __asm psubw xmm2, _kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, _kYSub16 \ + __asm pmullw xmm3, _kYToRgb \ + __asm paddw xmm0, xmm3 /* B += Y */ \ + __asm paddw xmm1, xmm3 /* G += Y */ \ + __asm paddw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +__declspec(naked) +void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + convertloop : + YUVTORGB_SSSE3 + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + movdqa [edx], xmm0 + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + + sub ecx, 8 + ja convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + convertloop : + YUVTORGB_SSSE3 + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + movdqa [edx], xmm5 + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + + sub ecx, 8 + ja convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + convertloop : + YUVTORGB_SSSE3 + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + 
movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + movdqa [edx], xmm2 + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + + sub ecx, 8 + ja convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + convertloop : + // Step 1: Find 4 UV contributions to 4 R,G,B values + movd xmm0, [esi] // U + movd xmm1, [esi + edi] // V + lea esi, [esi + 4] + punpcklbw xmm0, xmm1 // UV + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pmaddubsw xmm0, _kUVToB // scale B UV + pmaddubsw xmm1, _kUVToG // scale G UV + pmaddubsw xmm2, _kUVToR // scale R UV + psubw xmm0, _kUVBiasB // unbias back to signed + psubw xmm1, _kUVBiasG + psubw xmm2, _kUVBiasR + + // Step 2: Find Y contribution to 4 R,G,B values + movd xmm3, [eax] + lea eax, [eax + 4] + punpcklbw xmm3, xmm4 + psubsw xmm3, _kYSub16 + pmullw xmm3, _kYToRgb + paddw xmm0, xmm3 // B += Y + paddw xmm1, xmm3 // G += Y + paddw xmm2, xmm3 // R += Y + psraw xmm0, 6 + psraw xmm1, 6 + psraw xmm2, 6 + packuswb xmm0, xmm0 // B + packuswb xmm1, xmm1 // G + packuswb xmm2, xmm2 // R + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + punpcklwd xmm0, xmm2 // BGRA 4 pixels + movdqa [edx], xmm0 + lea edx, [edx + 16] + + sub ecx, 4 + ja convertloop + + pop edi + pop esi + ret + } +} +#endif + +#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 + +__declspec(naked) +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { + __asm { + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + pxor xmm4, xmm4 + movdqa xmm3, _kYSub16 + movdqa xmm2, _kYToRgb + + convertloop : + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm4 + psubsw xmm0, xmm3 + pmullw xmm0, xmm2 + psraw xmm0, 6 + packuswb xmm0, xmm0 // G + + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + por xmm0, xmm5 + movdqa [edx], xmm0 + punpckhwd xmm1, xmm1 // BGRA next 4 pixels + por xmm1, xmm5 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + + sub ecx, 8 + ja convertloop + + ret + } +} + +#endif #endif } // extern "C" + + +
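For reference, the per-pixel arithmetic that the new SSSE3 rows vectorize can be written out in scalar C++. This is an illustrative sketch only, not part of the patch, and the helper names YuvPixelToArgb and Clamp are invented here. It mirrors the YUVTORGB_SSSE3 sequence: bias U/V by 128 and Y by 16, scale by the 6-bit fixed-point coefficients YG/UB/UG/VG/VR, shift right by 6 (psraw), and saturate to 8 bits (packuswb).

#include <stdint.h>
#include <algorithm>

// Coefficients mirrored from row_win.cc (6-bit fixed point).
enum { kYG = 74, kUB = 127, kUG = -25, kUR = 0, kVB = 0, kVG = -52, kVR = 102 };

static inline uint8_t Clamp(int v) {
  return static_cast<uint8_t>(std::min(std::max(v, 0), 255));
}

// Scalar equivalent of one output pixel of the SSSE3 YUV-to-ARGB row.
static void YuvPixelToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* argb) {
  int y1 = (static_cast<int>(y) - 16) * kYG;               // psubsw kYSub16, pmullw kYToRgb
  int b = (y1 + kUB * (u - 128) + kVB * (v - 128)) >> 6;   // pmaddubsw kUVToB minus kUVBiasB
  int g = (y1 + kUG * (u - 128) + kVG * (v - 128)) >> 6;   // pmaddubsw kUVToG minus kUVBiasG
  int r = (y1 + kUR * (u - 128) + kVR * (v - 128)) >> 6;   // pmaddubsw kUVToR minus kUVBiasR
  argb[0] = Clamp(b);  // B
  argb[1] = Clamp(g);  // G
  argb[2] = Clamp(r);  // R
  argb[3] = 255;       // A, matching the pcmpeqb-generated 0xff alpha
}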
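The row_table.cc hunks also flip the rounding term from +0.5 to -0.5 on the negative U and V coefficients. The reason is that the int16 cast truncates toward zero, so round-to-nearest needs +0.5 for positive products but -0.5 for negative ones. A small standalone check (not part of the patch) illustrates the difference for one table entry:

#include <cstdio>

int main() {
  const int i = 255;                           // example chroma value
  const double ug = -0.391 * 64 * (i - 128);   // -3178.048, a negative green contribution
  int biased  = static_cast<short>(ug + 0.5);  // -3177: truncation pulls it toward zero
  int rounded = static_cast<short>(ug - 0.5);  // -3178: correct round-to-nearest
  std::printf("%d %d\n", biased, rounded);
  return 0;
}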