From e5f3fd4cc870b9b22112b3b2f25af06e067c8b7d Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 6 Feb 2012 22:40:32 +0000 Subject: [PATCH] YUY2 and UYVY Unaligned and any versions TEST=none BUG=none Review URL: https://webrtc-codereview.appspot.com/379009 git-svn-id: http://libyuv.googlecode.com/svn/trunk@168 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 13 +- source/convert.cc | 32 +- source/format_conversion.cc | 14 +- source/planar_functions.cc | 733 +++++----------------- source/row.h | 259 ++++---- source/row_common.cc | 139 +++-- source/row_neon.cc | 42 +- source/row_posix.cc | 1173 +++++++++++++++++++++-------------- source/row_win.cc | 321 +++++++++- 10 files changed, 1448 insertions(+), 1280 deletions(-) diff --git a/README.chromium b/README.chromium index c755e1faa..6871e2dc4 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 167 +Version: 168 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 1cc1e3f7f..44c514802 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,16 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#define LIBYUV_VERSION 167 - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif +#define LIBYUV_VERSION 168 #endif // INCLUDE_LIBYUV_VERSION_H_ + diff --git a/source/convert.cc b/source/convert.cc index 35f71638f..df9288820 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -366,7 +366,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = ARGBToYAnyRow_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; } @@ -382,7 +382,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, ARGBToUVRow = ARGBToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width <= kMaxStride) { - ARGBToUVRow = ARGBToUVAnyRow_SSSE3; + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; } @@ -428,7 +428,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = BGRAToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = BGRAToYAnyRow_SSSE3; + ARGBToYRow = BGRAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = BGRAToYRow_Unaligned_SSSE3; } @@ -444,7 +444,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame, ARGBToUVRow = BGRAToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width <= kMaxStride) { - ARGBToUVRow = BGRAToUVAnyRow_SSSE3; + ARGBToUVRow = BGRAToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = BGRAToUVRow_Unaligned_SSSE3; } @@ -490,7 +490,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ABGRToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = ABGRToYAnyRow_SSSE3; + ARGBToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ABGRToYRow_Unaligned_SSSE3; } @@ -506,7 +506,7 @@ int ABGRToI420(const uint8* src_frame, int 
src_stride_frame, ARGBToUVRow = ABGRToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width <= kMaxStride) { - ARGBToUVRow = ABGRToUVAnyRow_SSSE3; + ARGBToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ABGRToUVRow_Unaligned_SSSE3; } @@ -561,7 +561,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = ARGBToYAnyRow_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; } @@ -575,7 +575,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame, ARGBToUVRow = ARGBToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width <= kMaxStride) { - ARGBToUVRow = ARGBToUVAnyRow_SSSE3; + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; } else #endif { @@ -630,7 +630,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = ARGBToYAnyRow_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; } @@ -644,7 +644,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, ARGBToUVRow = ARGBToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width <= kMaxStride) { - ARGBToUVRow = ARGBToUVAnyRow_SSSE3; + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; } else #endif { @@ -699,7 +699,7 @@ int RGB565ToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = ARGBToYAnyRow_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; } @@ -713,7 +713,7 @@ int RGB565ToI420(const uint8* src_frame, int src_stride_frame, ARGBToUVRow = ARGBToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width <= kMaxStride) { - ARGBToUVRow = ARGBToUVAnyRow_SSSE3; + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; } else #endif { @@ -768,7 +768,7 @@ int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = ARGBToYAnyRow_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; } @@ -782,7 +782,7 @@ int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, ARGBToUVRow = ARGBToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width <= kMaxStride) { - ARGBToUVRow = ARGBToUVAnyRow_SSSE3; + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; } else #endif { @@ -837,7 +837,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) { - ARGBToYRow = ARGBToYAnyRow_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; } @@ -851,7 +851,7 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, ARGBToUVRow = ARGBToUVRow_SSSE3; } else if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 2) && width 
<= kMaxStride) { - ARGBToUVRow = ARGBToUVAnyRow_SSSE3; + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; } else #endif { diff --git a/source/format_conversion.cc b/source/format_conversion.cc index e0be7a00a..471ed52d4 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -452,22 +452,22 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToBayerRow)(const uint8* src_argb, @@ -490,7 +490,7 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + I420ToARGBRow(src_y, src_u, src_v, row, width); ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width); dst_bayer += dst_stride_bayer; src_y += src_stride_y; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 99cde2c21..2def26bf7 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -928,457 +928,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, return 0; } -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) -#define HAS_YUY2TOI420ROW_SSE2 -__declspec(naked) -void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja convertloop - ret - } -} - -__declspec(naked) -void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_y, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - ja convertloop - - pop edi - pop esi - ret - } -} - -__declspec(naked) -void YUY2ToI420RowY_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - pcmpeqb 
xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja convertloop - ret - } -} - -__declspec(naked) -void YUY2ToI420RowUV_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_y, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - ja convertloop - - pop edi - pop esi - ret - } -} - -#define HAS_UYVYTOI420ROW_SSE2 -__declspec(naked) -void UYVYToI420RowY_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y - psrlw xmm1, 8 - packuswb xmm0, xmm1 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - ja convertloop - ret - } -} - -__declspec(naked) -void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_y, int pix) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - sub edi, edx - - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV - pand xmm1, xmm5 - packuswb xmm0, xmm1 - movdqa xmm1, xmm0 - pand xmm0, xmm5 // U - packuswb xmm0, xmm0 - psrlw xmm1, 8 // V - packuswb xmm1, xmm1 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + edi], xmm1 - lea edx, [edx + 8] - sub ecx, 16 - ja convertloop - - pop edi - pop esi - ret - } -} - -#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) - -#define HAS_YUY2TOI420ROW_SSE2 -static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif -); -} - -static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_y, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" -"1: \n" - 
"movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_y), // %2 - "+r"(pix) // %3 - : "r"(static_cast(stride_yuy2)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif -); -} -static void YUY2ToI420RowY_Unaligned_SSE2(const uint8* src_yuy2, - uint8* dst_y, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif -); -} - -static void YUY2ToI420RowUV_Unaligned_SSE2(const uint8* src_yuy2, - int stride_yuy2, - uint8* dst_u, uint8* dst_y, - int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu (%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_y), // %2 - "+r"(pix) // %3 - : "r"(static_cast(stride_yuy2)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif -); -} -#define HAS_UYVYTOI420ROW_SSE2 -static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, - uint8* dst_y, int pix) { - asm volatile ( -"1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif -); -} - -static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_y, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 
1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_y), // %2 - "+r"(pix) // %3 - : "r"(static_cast(stride_uyvy)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif -); -} -#endif - -// Filter 2 rows of YUY2 UV's (422) into U and V (420) -void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int pix) { - // Output a row of UV values, filtering 2 rows of YUY2 - for (int x = 0; x < pix; x += 2) { - dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; - dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; - src_yuy2 += 4; - dst_u += 1; - dst_v += 1; - } -} - -void YUY2ToI420RowY_C(const uint8* src_yuy2, - uint8* dst_y, int pix) { - // Copy a row of yuy2 Y values - for (int x = 0; x < pix; ++x) { - dst_y[0] = src_yuy2[0]; - src_yuy2 += 2; - dst_y += 1; - } -} - -void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int pix) { - // Copy a row of uyvy UV values - for (int x = 0; x < pix; x += 2) { - dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; - dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; - src_uyvy += 4; - dst_u += 1; - dst_v += 1; - } -} - -void UYVYToI420RowY_C(const uint8* src_uyvy, - uint8* dst_y, int pix) { - // Copy a row of uyvy Y values - for (int x = 0; x < pix; ++x) { - dst_y[0] = src_uyvy[1]; - src_uyvy += 2; - dst_y += 1; - } -} - // Convert YUY2 to I420. int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_y, int dst_stride_y, @@ -1391,36 +940,42 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } - void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2, + void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); - void (*YUY2ToI420RowY)(const uint8* src_yuy2, + void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix); - YUY2ToI420RowY = YUY2ToI420RowY_C; - YUY2ToI420RowUV = YUY2ToI420RowUV_C; + YUY2ToYRow = YUY2ToYRow_C; + YUY2ToUVRow = YUY2ToUVRow_C; #if defined(HAS_YUY2TOI420ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { - YUY2ToI420RowUV = YUY2ToI420RowUV_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2; - YUY2ToI420RowY = YUY2ToI420RowY_Unaligned_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToI420RowY = YUY2ToI420RowY_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + if (width <= kMaxStride) { + YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2; + YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUVRow = YUY2ToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } } } } #endif for (int y = 0; y < height - 1; y += 2) { - YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); + YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); dst_u += dst_stride_u; dst_v += dst_stride_v; - YUY2ToI420RowY(src_yuy2, dst_y, width); - YUY2ToI420RowY(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); + YUY2ToYRow(src_yuy2, dst_y, width); + YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); dst_y += dst_stride_y * 2; 
src_yuy2 += src_stride_yuy2 * 2; } if (height & 1) { - YUY2ToI420RowUV(src_yuy2, 0, dst_u, dst_v, width); - YUY2ToI420RowY(src_yuy2, dst_y, width); + YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); + YUY2ToYRow(src_yuy2, dst_y, width); } return 0; } @@ -1437,34 +992,42 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } - void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy, + void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); - void (*UYVYToI420RowY)(const uint8* src_uyvy, + void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int pix); - UYVYToI420RowY = UYVYToI420RowY_C; - UYVYToI420RowUV = UYVYToI420RowUV_C; + UYVYToYRow = UYVYToYRow_C; + UYVYToUVRow = UYVYToUVRow_C; #if defined(HAS_UYVYTOI420ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { - UYVYToI420RowUV = UYVYToI420RowUV_SSE2; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - UYVYToI420RowY = UYVYToI420RowY_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + if (width <= kMaxStride) { + UYVYToUVRow = UYVYToUVRow_Any_SSE2; + UYVYToYRow = UYVYToYRow_Any_SSE2; + } + if (IS_ALIGNED(width, 16)) { + UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2; + UYVYToYRow = UYVYToYRow_Unaligned_SSE2; + if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { + UYVYToUVRow = UYVYToUVRow_SSE2; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + UYVYToYRow = UYVYToYRow_SSE2; + } } } } #endif for (int y = 0; y < height - 1; y += 2) { - UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); + UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); dst_u += dst_stride_u; dst_v += dst_stride_v; - UYVYToI420RowY(src_uyvy, dst_y, width); - UYVYToI420RowY(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); + UYVYToYRow(src_uyvy, dst_y, width); + UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); dst_y += dst_stride_y * 2; src_uyvy += src_stride_uyvy * 2; } if (height & 1) { - UYVYToI420RowUV(src_uyvy, 0, dst_u, dst_v, width); - UYVYToI420RowY(src_uyvy, dst_y, width); + UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); + UYVYToYRow(src_uyvy, dst_y, width); } return 0; } @@ -1481,32 +1044,32 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_NEON; + I420ToARGBRow = I420ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = 
I420ToARGBRow_C; } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width); + I420ToARGBRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1529,32 +1092,32 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; dst_stride_bgra = -dst_stride_bgra; } - void (*FastConvertYUVToBGRARow)(const uint8* y_buf, + void (*I420ToBGRARow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON) +#if defined(HAS_I420TOBGRAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToBGRARow = FastConvertYUVToBGRAAnyRow_NEON; + I420ToBGRARow = I420ToBGRARow_Any_NEON; if (IS_ALIGNED(width, 16)) { - FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_NEON; + I420ToBGRARow = I420ToBGRARow_NEON; } } else -#elif defined(HAS_FASTCONVERTYUVTOBGRAROW_SSSE3) +#elif defined(HAS_I420TOBGRAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToBGRARow = FastConvertYUVToBGRAAnyRow_SSSE3; + I420ToBGRARow = I420ToBGRARow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { - FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3; + I420ToBGRARow = I420ToBGRARow_SSSE3; } } else #endif { - FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C; + I420ToBGRARow = I420ToBGRARow_C; } for (int y = 0; y < height; ++y) { - FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_bgra, width); + I420ToBGRARow(src_y, src_u, src_v, dst_bgra, width); dst_bgra += dst_stride_bgra; src_y += src_stride_y; if (y & 1) { @@ -1577,32 +1140,32 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; dst_stride_abgr = -dst_stride_abgr; } - void (*FastConvertYUVToABGRRow)(const uint8* y_buf, + void (*I420ToABGRRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOABGRROW_NEON) +#if defined(HAS_I420TOABGRROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToABGRRow = FastConvertYUVToABGRAnyRow_NEON; + I420ToABGRRow = I420ToABGRRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_NEON; + I420ToABGRRow = I420ToABGRRow_NEON; } } else -#elif defined(HAS_FASTCONVERTYUVTOABGRROW_SSSE3) +#elif defined(HAS_I420TOABGRROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToABGRRow = FastConvertYUVToABGRAnyRow_SSSE3; + I420ToABGRRow = I420ToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { - FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3; + I420ToABGRRow = I420ToABGRRow_SSSE3; } } else #endif { - FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C; + I420ToABGRRow = I420ToABGRRow_C; } for (int y = 0; y < height; ++y) { - FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_abgr, width); + I420ToABGRRow(src_y, src_u, src_v, dst_abgr, width); dst_abgr += dst_stride_abgr; src_y += src_stride_y; if (y & 1) { @@ -1625,29 +1188,29 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if 
(TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix); #if defined(HAS_ARGBTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRGB24Row = ARGBToRGB24AnyRow_SSSE3; + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; @@ -1659,7 +1222,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + I420ToARGBRow(src_y, src_u, src_v, row, width); ARGBToRGB24Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -1683,29 +1246,29 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix); #if defined(HAS_ARGBTORAWROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRAWRow = ARGBToRAWAnyRow_SSSE3; + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; @@ -1717,7 +1280,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + I420ToARGBRow(src_y, src_u, src_v, row, width); ARGBToRAWRow(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -1741,29 +1304,29 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } else #endif { - 
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB565Row)(const uint8* src_rgb, uint8* dst_rgb, int pix); #if defined(HAS_ARGBTORGB565ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565Row = ARGBToRGB565AnyRow_SSE2; + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } @@ -1774,7 +1337,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + I420ToARGBRow(src_y, src_u, src_v, row, width); ARGBToRGB565Row(row, dst_rgb, width); dst_rgb += dst_stride_rgb; src_y += src_stride_y; @@ -1798,29 +1361,29 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix); #if defined(HAS_ARGBTOARGB1555ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB1555Row = ARGBToARGB1555AnyRow_SSE2; + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; } @@ -1831,7 +1394,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + I420ToARGBRow(src_y, src_u, src_v, row, width); ARGBToARGB1555Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -1855,29 +1418,29 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix); #if defined(HAS_ARGBTOARGB4444ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB4444Row = ARGBToARGB4444AnyRow_SSE2; + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; } @@ 
-1888,7 +1451,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + I420ToARGBRow(src_y, src_u, src_v, row, width); ARGBToARGB4444Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -1912,33 +1475,33 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_NEON; + I420ToARGBRow = I420ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } for (int y = 0; y < height; ++y) { - FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width); + I420ToARGBRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; @@ -1959,23 +1522,23 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUV444ToARGBRow)(const uint8* y_buf, + void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUV444TOARGBROW_SSSE3) +#if defined(HAS_I444TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3; + I444ToARGBRow = I444ToARGBRow_SSSE3; } else #endif { - FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C; + I444ToARGBRow = I444ToARGBRow_C; } for (int y = 0; y < height; ++y) { - FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width); + I444ToARGBRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; @@ -1994,21 +1557,21 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYToARGBRow)(const uint8* y_buf, + void (*YToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYTOARGBROW_SSE2) +#if defined(HAS_YTOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2; + YToARGBRow = YToARGBRow_SSE2; } else #endif { - FastConvertYToARGBRow = FastConvertYToARGBRow_C; + YToARGBRow = YToARGBRow_C; } for (int y = 0; y < height; ++y) { - FastConvertYToARGBRow(src_y, dst_argb, width); + YToARGBRow(src_y, dst_argb, 
width); dst_argb += dst_stride_argb; src_y += src_stride_y; } @@ -2205,7 +1768,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, #if defined(HAS_ARGBTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToRGB24Row = ARGBToRGB24AnyRow_SSSE3; + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; @@ -2237,7 +1800,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, #if defined(HAS_ARGBTORAWROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToRAWRow = ARGBToRAWAnyRow_SSSE3; + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; @@ -2267,29 +1830,29 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* argb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_NEON; + I420ToARGBRow = I420ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBAnyRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } int halfwidth = (width + 1) >> 1; @@ -2315,7 +1878,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); src_uv += src_stride_uv; } - FastConvertYUVToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width); + I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; } @@ -2333,22 +1896,22 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb; } - void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + void (*I420ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; + I420ToARGBRow = I420ToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) +#elif defined(HAS_I420TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; + I420ToARGBRow = I420ToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + I420ToARGBRow = I420ToARGBRow_C; } SIMD_ALIGNED(uint8 row[kMaxStride]); @@ -2385,7 +1948,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, SplitUV(src_uv, rowuv, rowuv + 
kMaxStride, halfwidth); src_uv += src_stride_uv; } - FastConvertYUVToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width); + I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width); ARGBToRGB565Row(row, dst_rgb, width); dst_rgb += dst_stride_rgb; src_y += src_stride_y; diff --git a/source/row.h b/source/row.h index 20ffdc8d3..5d3d12af0 100644 --- a/source/row.h +++ b/source/row.h @@ -13,6 +13,11 @@ #include "libyuv/basic_types.h" +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + #define kMaxStride (2048 * 4) #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) @@ -34,13 +39,15 @@ #define HAS_BGRATOUVROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 -#define HAS_FASTCONVERTYTOARGBROW_SSE2 -#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3 -#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3 -#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3 -#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3 +#define HAS_YTOARGBROW_SSE2 +#define HAS_I420TOARGBROW_SSSE3 +#define HAS_I420TOBGRAROW_SSSE3 +#define HAS_I420TOABGRROW_SSSE3 +#define HAS_I444TOARGBROW_SSSE3 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_SSE2 +#define HAS_YUY2TOI420ROW_SSE2 +#define HAS_UYVYTOI420ROW_SSE2 #endif // The following are available on Windows platforms @@ -48,7 +55,6 @@ #define HAS_RGB565TOARGBROW_SSE2 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 - #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 @@ -59,14 +65,9 @@ // The following are available on Neon platforms #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #define HAS_MIRRORROW_NEON -#define HAS_FASTCONVERTYUVTOARGBROW_NEON -#define HAS_FASTCONVERTYUVTOBGRAROW_NEON -#define HAS_FASTCONVERTYUVTOABGRROW_NEON -#endif - -#ifdef __cplusplus -namespace libyuv { -extern "C" { +#define HAS_I420TOARGBROW_NEON +#define HAS_I420TOBGRAROW_NEON +#define HAS_I420TOABGRROW_NEON #endif #if defined(_MSC_VER) @@ -81,21 +82,21 @@ typedef unsigned char __attribute__((vector_size(16))) uvec8; typedef signed short __attribute__((vector_size(16))) vec16; #endif -void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I420ToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I420ToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); @@ -164,114 +165,144 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); -void FastConvertYUVToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToBGRARow_C(const uint8* y_buf, - const uint8* u_buf, - const 
uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToABGRRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYToARGBRow_C(const uint8* y_buf, +void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); -void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width); +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); // 'Any' wrappers use memcpy() -void FastConvertYUVToARGBAnyRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToBGRAAnyRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToBGRARow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToABGRAnyRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToABGRRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void ARGBToRGB24AnyRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRAWAnyRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB1555AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToARGB4444AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* 
dst_rgb, int pix); -void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void BGRAToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ABGRToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb, +void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb, +void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb, +void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -void FastConvertYUVToARGBAnyRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToBGRAAnyRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToBGRARow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUVToABGRAnyRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +void I420ToABGRRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix); +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix); +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix); + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix); +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix); +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix); + +void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix); +void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix); + +void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); +void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); #ifdef __cplusplus } // extern "C" diff --git a/source/row_common.cc b/source/row_common.cc index add01c803..4fccc0593 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -271,7 +271,7 @@ static __inline uint32 Clip(int32 val) { } static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* 
rgb_buf, - int ashift, int rshift, int gshift, int bshift) { + int ashift, int rshift, int gshift, int bshift) { int32 y1 = (static_cast(y) - 16) * YG; uint32 b = Clip(static_cast((u * UB + v * VB) - (BB) + y1) >> 6); uint32 g = Clip(static_cast((u * UG + v * VG) - (BG) + y1) >> 6); @@ -282,11 +282,11 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf, (255u << ashift); } -void FastConvertYUVToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I420ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { for (int x = 0; x < width - 1; x += 2) { YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); @@ -300,11 +300,11 @@ void FastConvertYUVToARGBRow_C(const uint8* y_buf, } } -void FastConvertYUVToBGRARow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I420ToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { for (int x = 0; x < width - 1; x += 2) { YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24); YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24); @@ -318,11 +318,11 @@ void FastConvertYUVToBGRARow_C(const uint8* y_buf, } } -void FastConvertYUVToABGRRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I420ToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { for (int x = 0; x < width - 1; x += 2) { YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16); YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16); @@ -336,11 +336,11 @@ void FastConvertYUVToABGRRow_C(const uint8* y_buf, } } -void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { for (int x = 0; x < width; ++x) { YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0); y_buf += 1; @@ -350,9 +350,9 @@ void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, } } -void FastConvertYToARGBRow_C(const uint8* y_buf, - uint8* rgb_buf, - int width) { +void YToARGBRow_C(const uint8* y_buf, + uint8* rgb_buf, + int width) { for (int x = 0; x < width; ++x) { YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0); y_buf += 1; @@ -368,6 +368,51 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) { } } +// Filter 2 rows of YUY2 UV's (422) into U and V (420) +void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + // Output a row of UV values, filtering 2 rows of YUY2 + for (int x = 0; x < pix; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +void YUY2ToYRow_C(const uint8* src_yuy2, + uint8* dst_y, int pix) { + // Copy a row of yuy2 Y values + for (int x = 0; x < pix; ++x) { + dst_y[0] = src_yuy2[0]; + src_yuy2 += 2; + dst_y += 1; + } +} + +void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + // Copy a row of uyvy UV values + for (int x = 0; x < pix; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy 
+ 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +void UYVYToYRow_C(const uint8* src_uyvy, + uint8* dst_y, int pix) { + // Copy a row of uyvy Y values + for (int x = 0; x < pix; ++x) { + dst_y[0] = src_uyvy[1]; + src_uyvy += 2; + dst_y += 1; + } +} + // Wrappers to handle odd sizes/alignments #define MAKEYUVANY(NAMEANY, NAME) \ void NAMEANY(const uint8* y_buf, \ @@ -380,15 +425,15 @@ void NAMEANY(const uint8* y_buf, \ memcpy(rgb_buf, row, width << 2); \ } -#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) -MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3) -MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3) -MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3) +#if defined(HAS_I420TOARGBROW_SSSE3) +MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3) +MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3) +MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3) #endif -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) -MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON) -MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON) -MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON) +#if defined(HAS_I420TOARGBROW_NEON) +MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON) +MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON) +MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON) #endif #define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \ @@ -401,27 +446,29 @@ void NAMEANY(const uint8* argb_buf, \ } #if defined(HAS_ARGBTORGB24ROW_SSSE3) -MAKEYUVANYRGB(ARGBToRGB24AnyRow_SSSE3, ARGBToRGB24Row_SSSE3, 3) -MAKEYUVANYRGB(ARGBToRAWAnyRow_SSSE3, ARGBToRAWRow_SSSE3, 3) -MAKEYUVANYRGB(ARGBToRGB565AnyRow_SSE2, ARGBToRGB565Row_SSE2, 2) -MAKEYUVANYRGB(ARGBToARGB1555AnyRow_SSE2, ARGBToARGB1555Row_SSE2, 2) -MAKEYUVANYRGB(ARGBToARGB4444AnyRow_SSE2, ARGBToARGB4444Row_SSE2, 2) +MAKEYUVANYRGB(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3) +MAKEYUVANYRGB(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3) +MAKEYUVANYRGB(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2) +MAKEYUVANYRGB(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2) +MAKEYUVANYRGB(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2) #endif #ifdef HAS_ARGBTOYROW_SSSE3 -#define MAKEARGBTOYANY(NAMEANY, ARGBTOY) \ +#define MAKEANYTOYANY(NAMEANY, ARGBTOY) \ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ SIMD_ALIGNED(uint8 row[kMaxStride]); \ ARGBTOY(src_argb, row, width); \ memcpy(dst_y, row, width); \ } -MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3) -MAKEARGBTOYANY(BGRAToYAnyRow_SSSE3, BGRAToYRow_Unaligned_SSSE3) -MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3) +MAKEANYTOYANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3) +MAKEANYTOYANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3) +MAKEANYTOYANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3) +MAKEANYTOYANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2) +MAKEANYTOYANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2) -#define MAKEARGBTOUVANY(NAMEANY, ARGBTOUV) \ +#define MAKEANYTOUVANY(NAMEANY, ARGBTOUV) \ void NAMEANY(const uint8* src_argb0, int src_stride_argb, \ uint8* dst_u, uint8* dst_v, int width) { \ SIMD_ALIGNED(uint8 row[kMaxStride * 2]); \ @@ -431,9 +478,11 @@ MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3) memcpy(dst_v, row + kMaxStride, halfwidth); \ } 
-MAKEARGBTOUVANY(ARGBToUVAnyRow_SSSE3, ARGBToUVRow_Unaligned_SSSE3) -MAKEARGBTOUVANY(BGRAToUVAnyRow_SSSE3, BGRAToUVRow_Unaligned_SSSE3) -MAKEARGBTOUVANY(ABGRToUVAnyRow_SSSE3, ABGRToUVRow_Unaligned_SSSE3) +MAKEANYTOUVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3) +MAKEANYTOUVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3) +MAKEANYTOUVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3) +MAKEANYTOUVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2) +MAKEANYTOUVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2) #endif #ifdef __cplusplus diff --git a/source/row_neon.cc b/source/row_neon.cc index 8e4e9b081..e062e8a3f 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -55,19 +55,19 @@ extern "C" { "vtrn.u8 d22, d23 \n" \ "vtrn.u8 d16, d17 \n" \ -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) || \ - defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON) || \ - defined(HAS_FASTCONVERTYUVTOABGRROW_NEON) +#if defined(HAS_I420TOARGBROW_NEON) || \ + defined(HAS_I420TOBGRAROW_NEON) || \ + defined(HAS_I420TOABGRROW_NEON) static const vec8 kUVToRB[8] = { 127, 127, 127, 127, 102, 102, 102, 102 }; static const vec8 kUVToG[8] = { -25, -25, -25, -25, -52, -52, -52, -52 }; #endif -#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) -void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +#if defined(HAS_I420TOARGBROW_NEON) +void I420ToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" "vld1.u8 {d25}, [%6] \n" @@ -94,12 +94,12 @@ YUVTORGB } #endif -#if defined(HAS_FASTCONVERTYUVTOBGRAROW_NEON) -void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +#if defined(HAS_I420TOBGRAROW_NEON) +void I420ToBGRARow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" "vld1.u8 {d25}, [%6] \n" @@ -127,12 +127,12 @@ YUVTORGB } #endif -#if defined(HAS_FASTCONVERTYUVTOABGRROW_NEON) -void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +#if defined(HAS_I420TOABGRROW_NEON) +void I420ToABGRRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" "vld1.u8 {d25}, [%6] \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index bb213c4da..fe6f62d74 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -71,22 +71,22 @@ CONST uvec8 kShuffleMaskBGRAToARGB = { void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" -"1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "ja 1b \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b 
\n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -100,15 +100,15 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { asm volatile ( - "movdqa %3,%%xmm5 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "ja 1b \n" + "movdqa %3,%%xmm5 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -123,15 +123,15 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { asm volatile ( - "movdqa %3,%%xmm5 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "ja 1b \n" + "movdqa %3,%%xmm5 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -145,33 +145,33 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -185,33 +185,33 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb 
%%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2,0x20(%1) \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1,0x10(%1) \n" - "por %%xmm5,%%xmm3 \n" - "movdqa %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -225,28 +225,28 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -262,28 +262,28 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "ja 1b \n" + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw 
%%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -302,9 +302,9 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" : : "m"(kARGBToU), // %0 "m"(kARGBToV), // %1 @@ -315,42 +315,42 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif ); asm volatile ( - "sub %1,%2 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "sub %1,%2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm6 \n" + "pavgb (%0,%4,1),%%xmm0 \n" + "pavgb 0x10(%0,%4,1),%%xmm1 \n" + "pavgb 0x20(%0,%4,1),%%xmm2 \n" + "pavgb 0x30(%0,%4,1),%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -366,9 +366,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm5 \n" + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" : : "m"(kARGBToU), // %0 "m"(kARGBToV), // %1 @@ -379,46 +379,46 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif ); asm 
volatile ( - "sub %1,%2 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x10,%3 \n" - "ja 1b \n" + "sub %1,%2 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu (%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -432,7 +432,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif -#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +#ifdef HAS_I420TOARGBROW_SSSE3 #define UB 127 /* min(63,static_cast(2.018 * 64)) */ #define UG -25 /* static_cast(-0.391 * 64 - 0.5) */ #define UR 0 @@ -476,235 +476,235 @@ struct { // Convert 8 pixels #define YUVTORGB \ - "movd (%1),%%xmm0 \n" \ - "movd (%1,%2,1),%%xmm1 \n" \ - "lea 0x4(%1),%1 \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw (%5),%%xmm0 \n" \ - "pmaddubsw 16(%5),%%xmm1 \n" \ - "pmaddubsw 32(%5),%%xmm2 \n" \ - "psubw 48(%5),%%xmm0 \n" \ - "psubw 64(%5),%%xmm1 \n" \ - "psubw 80(%5),%%xmm2 \n" \ - "movq (%0),%%xmm3 \n" \ - "lea 0x8(%0),%0 \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%5),%%xmm3 \n" \ - "pmullw 112(%5),%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" + "movd (%1),%%xmm0 \n" \ + "movd (%1,%2,1),%%xmm1 \n" \ + "lea 0x4(%1),%1 \n" \ + 
"punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw (%5),%%xmm0 \n" \ + "pmaddubsw 16(%5),%%xmm1 \n" \ + "pmaddubsw 32(%5),%%xmm2 \n" \ + "psubw 48(%5),%%xmm0 \n" \ + "psubw 64(%5),%%xmm1 \n" \ + "psubw 80(%5),%%xmm2 \n" \ + "movq (%0),%%xmm3 \n" \ + "lea 0x8(%0),%0 \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw 96(%5),%%xmm3 \n" \ + "pmullw 112(%5),%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" -void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile ( - "sub %1,%2 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - "1: \n" - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%3) \n" - "movdqa %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" - "ja 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile ( - "sub %1,%2 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - "1: \n" - YUVTORGB - "pcmpeqb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm5 \n" - "movdqa %%xmm5,%%xmm0 \n" - "punpcklwd %%xmm1,%%xmm5 \n" - "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%3) \n" - "movdqa %%xmm0,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" - "ja 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile ( - "sub %1,%2 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - "1: \n" - YUVTORGB - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm2 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,(%3) \n" - "movdqa %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" - "ja 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile ( - "sub %1,%2 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - "1: \n" - "movd (%1),%%xmm0 \n" - "movd (%1,%2,1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - 
"movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pmaddubsw (%5),%%xmm0 \n" - "pmaddubsw 16(%5),%%xmm1 \n" - "pmaddubsw 32(%5),%%xmm2 \n" - "psubw 48(%5),%%xmm0 \n" - "psubw 64(%5),%%xmm1 \n" - "psubw 80(%5),%%xmm2 \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm4,%%xmm3 \n" - "psubsw 96(%5),%%xmm3 \n" - "pmullw 112(%5),%%xmm3 \n" - "paddsw %%xmm3,%%xmm0 \n" - "paddsw %%xmm3,%%xmm1 \n" - "paddsw %%xmm3,%%xmm2 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm2 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm2,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "movdqa %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "sub $0x4,%4 \n" - "ja 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} -#endif - -#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 - -void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi +void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "mov $0x10001000,%%eax \n" - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "mov $0x012a012a,%%eax \n" - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + "1: \n" + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%3) \n" + "movdqa %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "ja 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + "1: \n" + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqa %%xmm5,(%3) \n" + "movdqa %%xmm0,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "ja 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + "1: \n" + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,(%3) \n" + "movdqa %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "ja 1b \n" + : "+r"(y_buf), // %0 
+ "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + + "1: \n" + "movd (%1),%%xmm0 \n" + "movd (%1,%2,1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pmaddubsw (%5),%%xmm0 \n" + "pmaddubsw 16(%5),%%xmm1 \n" + "pmaddubsw 32(%5),%%xmm2 \n" + "psubw 48(%5),%%xmm0 \n" + "psubw 64(%5),%%xmm1 \n" + "psubw 80(%5),%%xmm2 \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm4,%%xmm3 \n" + "psubsw 96(%5),%%xmm3 \n" + "pmullw 112(%5),%%xmm3 \n" + "paddsw %%xmm3,%%xmm0 \n" + "paddsw %%xmm3,%%xmm1 \n" + "paddsw %%xmm3,%%xmm2 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm2 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm2,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "ja 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif + +#ifdef HAS_YTOARGBROW_SSE2 + +void YToARGBRow_SSE2(const uint8* y_buf, // rdi + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "mov $0x10001000,%%eax \n" + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "mov $0x012a012a,%%eax \n" + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" "1: \n" // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,16(%1) \n" - "lea 32(%1),%1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,16(%1) \n" + "lea 32(%1),%1 \n" - "sub $0x8,%2 \n" - "ja 1b \n" + "sub $0x8,%2 \n" + "ja 1b \n" : "+r"(y_buf), // %0 "+r"(rgb_buf), // %1 "+rm"(width) // %2 @@ -787,15 +787,15 @@ CONST uvec8 kShuffleMirror = { void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast(width); asm volatile ( - "movdqa %3,%%xmm5 \n" - "lea -0x10(%0),%0 \n" + "movdqa %3,%%xmm5 \n" + "lea -0x10(%0),%0 \n" "1: \n" - "movdqa (%0,%2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "ja 1b \n" + "movdqa (%0,%2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "ja 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -813,20 +813,20 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast(width); asm volatile ( - "lea -0x10(%0),%0 \n" + "lea -0x10(%0),%0 \n" "1: \n" - "movdqu (%0,%2),%%xmm0 \n" - "movdqu %%xmm0,%%xmm1 \n" - "psllw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "pshuflw $0x1b,%%xmm0,%%xmm0 \n" - "pshufhw $0x1b,%%xmm0,%%xmm0 \n" - "pshufd $0x4e,%%xmm0,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "ja 1b \n" + "movdqu (%0,%2),%%xmm0 \n" + "movdqu %%xmm0,%%xmm1 \n" + "psllw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshufd $0x4e,%%xmm0,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "ja 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -839,6 +839,269 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { } #endif +#ifdef HAS_YUY2TOI420ROW_SSE2 +void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 
0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_y), // %2 + "+r"(pix) // %3 + : "r"(static_cast(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, + int stride_yuy2, + uint8* dst_u, uint8* dst_y, + int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu (%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_y), // %2 + "+r"(pix) // %3 + : "r"(static_cast(stride_yuy2)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif +); +} + +void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_y), // %2 + "+r"(pix) // %3 + : "r"(static_cast(stride_uyvy)) // %4 + : "memory", "cc" +#if 
defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + asm volatile ( + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu (%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_y), // %2 + "+r"(pix) // %3 + : "r"(static_cast(stride_uyvy)) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +#endif // HAS_YUY2TOI420ROW_SSE2 + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_win.cc b/source/row_win.cc index be71e0355..74dd3ed9c 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1174,7 +1174,7 @@ __asm { } } -#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 +#ifdef HAS_I420TOARGBROW_SSSE3 #define YG 74 /* static_cast(1.164 * 64 + 0.5) */ @@ -1242,11 +1242,11 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; } __declspec(naked) -void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I420ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { __asm { push esi push edi @@ -1282,11 +1282,11 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, } __declspec(naked) -void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I420ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { __asm { push esi push edi @@ -1322,11 +1322,11 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, } __declspec(naked) -void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I420ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { __asm { push esi push edi @@ -1362,11 +1362,11 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, } __declspec(naked) -void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { +void I444ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { __asm { push esi push edi @@ -1427,11 +1427,11 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, } 
#endif -#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 +#ifdef HAS_YTOARGBROW_SSE2 __declspec(naked) -void FastConvertYToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +void YToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { __asm { pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 @@ -1529,6 +1529,277 @@ __asm { } } #endif + +#ifdef HAS_YUY2TOI420ROW_SSE2 +__declspec(naked) +void YUY2ToYRow_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void UYVYToYRow_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 
16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_YUY2TOI420ROW_SSE2 + #ifdef __cplusplus } // extern "C" } // namespace libyuv
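[Editor's note, not part of the patch] The new SSE2 and Windows rows above all exploit the same byte-lane structure: in YUY2 the luma samples are the even bytes and the chroma samples the odd bytes (so `pand` with 0x00ff00ff extracts Y and `psrlw 8` extracts UV), while UYVY swaps the roles, and the UVRow kernels average chroma from two source rows with `pavgb` to go from 4:2:2 to 4:2:0. A minimal, self-contained C++ sketch restating that scalar logic follows; the 4x2 test image and function names are made up for illustration.

    #include <cstdint>
    #include <cstdio>

    // Scalar equivalent of YUY2ToYRow_*: keep the even bytes of the row.
    static void Yuy2ToYRow(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
      for (int x = 0; x < pix; ++x)
        dst_y[x] = src_yuy2[x * 2];
    }

    // Scalar equivalent of YUY2ToUVRow_*: rounded average of the chroma bytes
    // from two adjacent rows, split into separate U and V planes.
    static void Yuy2ToUVRow(const uint8_t* src_yuy2, int stride,
                            uint8_t* dst_u, uint8_t* dst_v, int pix) {
      for (int x = 0; x < pix; x += 2) {
        dst_u[x / 2] = (src_yuy2[1] + src_yuy2[stride + 1] + 1) >> 1;
        dst_v[x / 2] = (src_yuy2[3] + src_yuy2[stride + 3] + 1) >> 1;
        src_yuy2 += 4;
      }
    }

    int main() {
      // 4x2 YUY2 image, 8 bytes per row: Y0 U0 Y1 V0  Y2 U1 Y3 V1.
      const uint8_t yuy2[2][8] = {
        {10, 100, 20, 200, 30, 110, 40, 210},
        {50, 120, 60, 220, 70, 130, 80, 230},
      };
      uint8_t y[4], u[2], v[2];
      Yuy2ToYRow(yuy2[0], y, 4);
      Yuy2ToUVRow(yuy2[0], 8, u, v, 4);  // stride 8 reaches the second row
      printf("Y: %d %d %d %d\n", y[0], y[1], y[2], y[3]);     // 10 20 30 40
      printf("U: %d %d  V: %d %d\n", u[0], u[1], v[0], v[1]); // 110 120  210 220
      return 0;
    }

For UYVY the same sketch applies with the indices shifted by one byte: luma is read from the odd positions and chroma from the even positions, matching UYVYToYRow_C and UYVYToUVRow_C earlier in the patch.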