diff --git a/include/planar_functions.h b/include/planar_functions.h index 469b31d37..a078b6762 100644 --- a/include/planar_functions.h +++ b/include/planar_functions.h @@ -19,17 +19,8 @@ namespace libyuv { class PlanarFunctions { public: - // Copy I420 to I420. - static void I420Copy(const uint8* src_y, int src_pitch_y, - const uint8* src_u, int src_pitch_u, - const uint8* src_v, int src_pitch_v, - uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - int width, int height); - - // Convert I422 to I420. Used by MJPG. - static void I422ToI420(const uint8* src_y, int src_pitch_y, + // Copy I420 to I420. + static void I420Copy(const uint8* src_y, int src_pitch_y, const uint8* src_u, int src_pitch_u, const uint8* src_v, int src_pitch_v, uint8* dst_y, int dst_pitch_y, @@ -37,24 +28,100 @@ class PlanarFunctions { uint8* dst_v, int dst_pitch_v, int width, int height); - // Convert M420 to I420. - static void M420ToI420(uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - const uint8* m420, int pitch_m420, - int width, int height); + // Convert I422 to I420. Used by MJPG. + static void I422ToI420(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); - // Convert NV12 to I420. Also used for NV21. - static void NV12ToI420(uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - const uint8* src_y, - const uint8* src_uv, - int src_pitch, - int width, int height); + // Convert M420 to I420. + static void M420ToI420(const uint8* src_m420, int src_pitch_m420, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); - DISALLOW_IMPLICIT_CONSTRUCTORS(PlanarFunctions); -}; + // Convert Q420 to I420. 
+ static void Q420ToI420(const uint8* src_y, int src_pitch_y, + const uint8* src_yuy2, int src_pitch_yuy2, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); + + // Convert NV12 to I420. Also used for NV21. + static void NV12ToI420(const uint8* src_y, + const uint8* src_uv, int src_pitch, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); + + // Convert YUY2 to I420. + static void YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); + + // Convert UYVY to I420. + static void UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height); + + // Convert I420 to ARGB. + static void I420ToARGB(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + // Convert I422 to ARGB. + static void I422ToARGB(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + // Convert I444 to ARGB. + static void I444ToARGB(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + // Convert I400 to ARGB. + static void I400ToARGB(const uint8* src_y, int src_pitch_y, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + // Convert I400 to ARGB. + static void I400ToARGB_Reference(const uint8* src_y, int src_pitch_y, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + // Convert RAW to ARGB. 
+ static void RAWToARGB(const uint8* src_raw, int src_pitch_raw, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + // Convert BG24 to ARGB. + static void BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + // Convert ABGR to ARGB. + static void ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr, + uint8* dst_argb, int dst_pitch_argb, + int width, int height); + + DISALLOW_IMPLICIT_CONSTRUCTORS(PlanarFunctions); + }; } // namespace libyuv diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 452543d07..01829c4cb 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -14,6 +14,7 @@ #include #include "cpu_id.h" +#include "row.h" namespace libyuv { @@ -37,49 +38,93 @@ static void SplitUV_NEON(const uint8* src_uv, ); } -#elif defined(WIN32) && !defined(COVERAGE_ENABLED) +#elif (defined(WIN32) || defined(__i386__)) && !defined(COVERAGE_ENABLED) && \ + !defined(__PIC__) && !TARGET_IPHONE_SIMULATOR +#if defined(_MSC_VER) +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var +#elif defined(OSX) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#else +#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16))) +#endif + +// shuffle constant to put even bytes in low 8 and odd bytes in high 8 bytes +extern "C" TALIGN16(const uint8, shufevenodd[16]) = + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) #define HAS_SPLITUV_SSE2 +__declspec(naked) static void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { - mov esi, src_uv - mov edi, dst_u - mov edx, dst_v - mov ecx, pix - mov eax, 0x00ff00ff // mask for isolating low bytes - movd xmm7, eax - pshufd xmm7, xmm7, 0 + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm7, 
xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 wloop: - movdqa xmm0, [esi] - movdqa xmm1, [esi + 16] - lea esi, [esi + 32] + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 pand xmm0, xmm7 // even bytes pand xmm1, xmm7 packuswb xmm0, xmm1 - movdqa [edi], xmm0 - lea edi, [edi + 16] + movdqa [edx], xmm0 + lea edx, [edx + 16] psrlw xmm2, 8 // odd bytes psrlw xmm3, 8 packuswb xmm2, xmm3 - movdqa [edx], xmm2 - lea edx, [edx + 16] + movdqa [edi], xmm2 + lea edi, [edi + 16] sub ecx, 16 ja wloop + pop edi + ret } } +#define HAS_SPLITUV_SSSE3 +__declspec(naked) +static void SplitUV_SSSE3(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + movdqa xmm7, _shufevenodd + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pshufb xmm0, xmm7 // 8 u's and 8 v's + pshufb xmm1, xmm7 // 8 u's and 8 v's + movdqa xmm2, xmm0 + punpcklqdq xmm0, xmm1 // 16 u's + punpckhqdq xmm2, xmm1 // 16 v's + movdqa [edx], xmm0 + lea edx, [edx + 16] + movdqa [edi], xmm2 + lea edi, [edi + 16] + sub ecx, 16 + ja wloop + pop edi + ret + } +} #elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ !TARGET_IPHONE_SIMULATOR - -// GCC version is same as Visual C - #define HAS_SPLITUV_SSE2 extern "C" void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); - asm( ".text\n" #if defined(OSX) @@ -89,41 +134,74 @@ extern "C" void SplitUV_SSE2(const uint8* src_uv, ".global SplitUV_SSE2\n" "SplitUV_SSE2:\n" #endif - "push %ebp\n" - "mov %esp,%ebp\n" - "push %esi\n" "push %edi\n" - "mov 0x8(%ebp),%esi\n" - "mov 0xc(%ebp),%edi\n" - "mov 0x10(%ebp),%edx\n" - "mov 0x14(%ebp),%ecx\n" - "mov $0xff00ff,%eax\n" - "movd %eax,%xmm7\n" - "pshufd $0x0,%xmm7,%xmm7\n" + "mov 0x8(%esp),%eax\n" + "mov 0xc(%esp),%edx\n" + "mov 0x10(%esp),%edi\n" + "mov 
0x14(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa 0x10(%esi),%xmm1\n" - "lea 0x20(%esi),%esi\n" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "lea 0x20(%eax),%eax\n" "movdqa %xmm0,%xmm2\n" "movdqa %xmm1,%xmm3\n" "pand %xmm7,%xmm0\n" "pand %xmm7,%xmm1\n" "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" + "movdqa %xmm0,(%edx)\n" + "lea 0x10(%edx),%edx\n" "psrlw $0x8,%xmm2\n" "psrlw $0x8,%xmm3\n" "packuswb %xmm3,%xmm2\n" - "movdqa %xmm2,(%edx)\n" - "lea 0x10(%edx),%edx\n" + "movdqa %xmm2,(%edi)\n" + "lea 0x10(%edi),%edi\n" "sub $0x10,%ecx\n" "ja 1b\n" "pop %edi\n" - "pop %esi\n" - "pop %ebp\n" "ret\n" ); + +#define HAS_SPLITUV_SSSE3 +extern "C" void SplitUV_SSSE3(const uint8* src_uv, + uint8* dst_u, uint8* dst_v, int pix); + asm( + ".text\n" +#if defined(OSX) + ".globl _SplitUV_SSSE3\n" +"_SplitUV_SSSE3:\n" +#else + ".global SplitUV_SSSE3\n" +"SplitUV_SSSE3:\n" +#endif + "push %edi\n" + "mov 0x8(%esp),%eax\n" + "mov 0xc(%esp),%edx\n" + "mov 0x10(%esp),%edi\n" + "mov 0x14(%esp),%ecx\n" + "movdqa _shufevenodd,%xmm7\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "lea 0x20(%eax),%eax\n" + "pshufb %xmm7,%xmm0\n" + "pshufb %xmm7,%xmm1\n" + "movdqa %xmm0,%xmm2\n" + "punpcklqdq %xmm1,%xmm0\n" + "punpckhqdq %xmm1,%xmm2\n" + "movdqa %xmm0,(%edx)\n" + "lea 0x10(%edx),%edx\n" + "movdqa %xmm2,(%edi)\n" + "lea 0x10(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "pop %edi\n" + "ret\n" +); +#endif #endif static void SplitUV_C(const uint8* src_uv, @@ -163,70 +241,6 @@ static void I420CopyPlane2(const uint8* src, int src_pitch_0, int src_pitch_1, } } -// Support converting from FOURCC_M420 -// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for -// easy conversion to I420. -// M420 format description: -// M420 is row biplanar 420: 2 rows of Y and 1 row of VU. -// Chroma is half width / half height. (420) -// pitch_m420 is row planar. 
Normally this will be the width in pixels. -// The UV plane is half width, but 2 values, so pitch_m420 applies to this -// as well as the two Y planes. -// TODO(fbarchard): Do NV21/NV12 formats with this function -static void X420ToI420(uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - const uint8* src_y, - int src_pitch_y0, int src_pitch_y1, - const uint8* src_uv, int src_pitch_uv, - int width, int height) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_pitch_y; - dst_u = dst_u + (height - 1) * dst_pitch_u; - dst_v = dst_v + (height - 1) * dst_pitch_v; - dst_pitch_y = -dst_pitch_y; - dst_pitch_u = -dst_pitch_u; - dst_pitch_v = -dst_pitch_v; - } - - int halfwidth = (width + 1) >> 1; - void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -#if defined(HAS_SPLITUV_NEON) - if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasNEON) && - (halfwidth % 16 == 0) && - IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && - IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) && - IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { - SplitUV = SplitUV_NEON; - } else -#elif defined(HAS_SPLITUV_SSE2) - if (libyuv::CpuInfo::TestCpuFlag(libyuv::CpuInfo::kCpuHasSSE2) && - (halfwidth % 16 == 0) && - IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && - IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) && - IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { - SplitUV = SplitUV_SSE2; - } else -#endif - { - SplitUV = SplitUV_C; - } - - I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y, - width, height); - - int halfheight = (height + 1) >> 1; - for (int y = 0; y < halfheight; ++y) { - // Copy a row of UV. 
- SplitUV(src_uv, dst_u, dst_v, halfwidth); - dst_u += dst_pitch_u; - dst_v += dst_pitch_v; - src_uv += src_pitch_uv; - } -} - // TODO(fbarchard): For biplanar formats (ie NV21), the Y plane is the same // as I420, and only the chroma plane varies. Copy the Y plane by reference, // and just convert the UV. This method can be used for NV21, NV12, I420, @@ -312,30 +326,914 @@ void PlanarFunctions::I422ToI420(const uint8* src_y, int src_pitch_y, } } +// Support converting from FOURCC_M420 +// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for +// easy conversion to I420. +// M420 format description: +// M420 is row biplanar 420: 2 rows of Y and 1 row of VU. +// Chroma is half width / half height. (420) +// src_pitch_m420 is row planar. Normally this will be the width in pixels. +// The UV plane is half width, but 2 values, so src_pitch_m420 applies to this +// as well as the two Y planes. +// TODO(fbarchard): Do NV21/NV12 formats with this function +static void X420ToI420(const uint8* src_y, + int src_pitch_y0, int src_pitch_y1, + const uint8* src_uv, int src_pitch_uv, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_pitch_y; + dst_u = dst_u + (height - 1) * dst_pitch_u; + dst_v = dst_v + (height - 1) * dst_pitch_v; + dst_pitch_y = -dst_pitch_y; + dst_pitch_u = -dst_pitch_u; + dst_pitch_v = -dst_pitch_v; + } + + int halfwidth = (width + 1) >> 1; + void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +#if defined(HAS_SPLITUV_NEON) + if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasNEON) && + (halfwidth % 16 == 0) && + IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { + SplitUV = SplitUV_NEON; + } else +#elif defined(HAS_SPLITUV_SSSE3) + if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSSE3) && + (halfwidth % 16 == 0) && + IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { + SplitUV = SplitUV_SSSE3; + } else +#elif defined(HAS_SPLITUV_SSE2) + if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) && + (halfwidth % 16 == 0) && + IS_ALIGNED(src_uv, 16) && (src_pitch_uv % 16 == 0) && + IS_ALIGNED(dst_u, 16) && (dst_pitch_u % 16 == 0) && + IS_ALIGNED(dst_v, 16) && (dst_pitch_v % 16 == 0)) { + SplitUV = SplitUV_SSE2; + } else +#endif + { + SplitUV = SplitUV_C; + } + + I420CopyPlane2(src_y, src_pitch_y0, src_pitch_y1, dst_y, dst_pitch_y, + width, height); + + int halfheight = (height + 1) >> 1; + for (int y = 0; y < halfheight; ++y) { + // Copy a row of UV. + SplitUV(src_uv, dst_u, dst_v, halfwidth); + dst_u += dst_pitch_u; + dst_v += dst_pitch_v; + src_uv += src_pitch_uv; + } +} + // Convert M420 to I420. 
-void PlanarFunctions::M420ToI420(uint8* dst_y, int dst_pitch_y, +void PlanarFunctions::M420ToI420(const uint8* src_m420, int src_pitch_m420, + uint8* dst_y, int dst_pitch_y, uint8* dst_u, int dst_pitch_u, uint8* dst_v, int dst_pitch_v, - const uint8* m420, int pitch_m420, int width, int height) { - X420ToI420(dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, - m420, pitch_m420, pitch_m420 * 2, - m420 + pitch_m420 * 2, pitch_m420 * 3, + X420ToI420(src_m420, src_pitch_m420, src_pitch_m420 * 2, + src_m420 + src_pitch_m420 * 2, src_pitch_m420 * 3, + dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, width, height); } // Convert NV12 to I420. -void PlanarFunctions::NV12ToI420(uint8* dst_y, int dst_pitch_y, - uint8* dst_u, int dst_pitch_u, - uint8* dst_v, int dst_pitch_v, - const uint8* src_y, +void PlanarFunctions::NV12ToI420(const uint8* src_y, const uint8* src_uv, int src_pitch, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, int width, int height) { - X420ToI420(dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, - src_y, src_pitch, src_pitch, + X420ToI420(src_y, src_pitch, src_pitch, src_uv, src_pitch, + dst_y, dst_pitch_y, dst_u, dst_pitch_u, dst_v, dst_pitch_v, width, height); } +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_SPLITYUY2_SSE2 +__declspec(naked) +static void SplitYUY2_SSE2(const uint8* src_yuy2, + uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov edx, [esp + 8 + 8] // dst_y + mov esi, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + pand xmm2, xmm7 // even bytes are Y + pand xmm3, xmm7 + packuswb xmm2, xmm3 + movdqa [edx], xmm2 + lea edx, [edx + 16] + 
psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm7 // U + packuswb xmm0, xmm0 + movq qword ptr [esi], xmm0 + lea esi, [esi + 8] + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edi], xmm1 + lea edi, [edi + 8] + sub ecx, 16 + ja wloop + + pop edi + pop esi + ret + } +} +#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ + !TARGET_IPHONE_SIMULATOR +#define HAS_SPLITYUY2_SSE2 +extern "C" void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, + uint8* dst_u, uint8* dst_v, int pix); + asm( + ".text\n" +#if defined(OSX) + ".globl _SplitYUY2_SSE2\n" +"_SplitYUY2_SSE2:\n" +#else + ".global SplitYUY2_SSE2\n" +"SplitYUY2_SSE2:\n" +#endif + "push %esi\n" + "push %edi\n" + "mov 0xc(%esp),%eax\n" + "mov 0x10(%esp),%edx\n" + "mov 0x14(%esp),%esi\n" + "mov 0x18(%esp),%edi\n" + "mov 0x1c(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "lea 0x20(%eax),%eax\n" + "movdqa %xmm0,%xmm2\n" + "movdqa %xmm1,%xmm3\n" + "pand %xmm7,%xmm2\n" + "pand %xmm7,%xmm3\n" + "packuswb %xmm3,%xmm2\n" + "movdqa %xmm2,(%edx)\n" + "lea 0x10(%edx),%edx\n" + "psrlw $0x8,%xmm0\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,%xmm1\n" + "pand %xmm7,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%esi)\n" + "lea 0x8(%esi),%esi\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm1\n" + "movq %xmm1,(%edi)\n" + "lea 0x8(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" +); +#endif + +static void SplitYUY2_C(const uint8* src_yuy2, + uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { + // Copy a row of YUY2. + for (int x = 0; x < pix; x += 2) { + dst_y[0] = src_yuy2[0]; + dst_y[1] = src_yuy2[2]; + dst_u[0] = src_yuy2[1]; + dst_v[0] = src_yuy2[3]; + src_yuy2 += 4; + dst_y += 2; + dst_u += 1; + dst_v += 1; + } +} + +// Convert Q420 to I420. 
+// Format is rows of YY/YUYV +void PlanarFunctions::Q420ToI420(const uint8* src_y, int src_pitch_y, + const uint8* src_yuy2, int src_pitch_yuy2, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height) { + void (*SplitYUY2)(const uint8* src_yuy2, + uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix); +#if defined(HAS_SPLITYUY2_SSE2) + if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) && + (width % 16 == 0) && + IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) { + SplitYUY2 = SplitYUY2_SSE2; + } else +#endif + { + SplitYUY2 = SplitYUY2_C; + } + for (int y = 0; y < height; y += 2) { + memcpy(dst_y, src_y, width); + dst_y += dst_pitch_y; + src_y += src_pitch_y; + + // Copy a row of YUY2. + SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width); + dst_y += dst_pitch_y; + dst_u += dst_pitch_u; + dst_v += dst_pitch_v; + src_yuy2 += src_pitch_yuy2; + } +} + +#if defined(WIN32) && !defined(COVERAGE_ENABLED) +#define HAS_YUY2TOI420ROW_SSE2 +__declspec(naked) +void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm7 // even bytes are Y + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + ret + } +} + +__declspec(naked) +void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // pitch_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + 
mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm7 // U + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edi], xmm1 + lea edi, [edi + 8] + sub ecx, 16 + ja wloop + + pop edi + pop esi + ret + } +} + +#define HAS_UYVYTOI420ROW_SSE2 +__declspec(naked) +void UYVYToI420RowY_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // odd bytes are Y + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja wloop + ret + } +} + +__declspec(naked) +void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy, + uint8* dst_u, uint8* dst_y, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // pitch_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + pand xmm0, xmm7 // UYVY -> UVUV + pand xmm1, xmm7 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm7 // U + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edi], xmm1 + lea edi, [edi + 8] + sub ecx, 16 + ja wloop + + pop edi + pop esi + ret + } +} 
+#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ + !TARGET_IPHONE_SIMULATOR + +#define HAS_YUY2TOI420ROW_SSE2 +extern "C" void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, + uint8* dst_y, int pix); + asm( + ".text\n" +#if defined(OSX) + ".globl _YUY2ToI420RowY_SSE2\n" +"_YUY2ToI420RowY_SSE2:\n" +#else + ".global YUY2ToI420RowY_SSE2\n" +"YUY2ToI420RowY_SSE2:\n" +#endif + "mov 0x4(%esp),%eax\n" + "mov 0x8(%esp),%edx\n" + "mov 0xc(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "lea 0x20(%eax),%eax\n" + "pand %xmm7,%xmm0\n" + "pand %xmm7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edx)\n" + "lea 0x10(%edx),%edx\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "ret\n" +); + +extern "C" void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int pitch_yuy2, + uint8* dst_u, uint8* dst_y, int pix); + asm( + ".text\n" +#if defined(OSX) + ".globl _YUY2ToI420RowUV_SSE2\n" +"_YUY2ToI420RowUV_SSE2:\n" +#else + ".global YUY2ToI420RowUV_SSE2\n" +"YUY2ToI420RowUV_SSE2:\n" +#endif + "push %esi\n" + "push %edi\n" + "mov 0xc(%esp),%eax\n" + "mov 0x10(%esp),%esi\n" + "mov 0x14(%esp),%edx\n" + "mov 0x18(%esp),%edi\n" + "mov 0x1c(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "movdqa (%eax,%esi,1),%xmm2\n" + "movdqa 0x10(%eax,%esi,1),%xmm3\n" + "lea 0x20(%eax),%eax\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "psrlw $0x8,%xmm0\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,%xmm1\n" + "pand %xmm7,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%edx)\n" + "lea 0x8(%edx),%edx\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm1\n" + "movq %xmm1,(%edi)\n" + "lea 0x8(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" +); + +#define HAS_UYVYTOI420ROW_SSE2 +extern "C" void UYVYToI420RowY_SSE2(const uint8* src_uyvy, + uint8* dst_y, int pix); + asm( + ".text\n" +#if 
defined(OSX) + ".globl _UYVYToI420RowY_SSE2\n" +"_UYVYToI420RowY_SSE2:\n" +#else + ".global UYVYToI420RowY_SSE2\n" +"UYVYToI420RowY_SSE2:\n" +#endif + "mov 0x4(%esp),%eax\n" + "mov 0x8(%esp),%edx\n" + "mov 0xc(%esp),%ecx\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "lea 0x20(%eax),%eax\n" + "psrlw $0x8,%xmm0\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,(%edx)\n" + "lea 0x10(%edx),%edx\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "ret\n" +); + +extern "C" void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int pitch_uyvy, + uint8* dst_u, uint8* dst_y, int pix); + asm( + ".text\n" +#if defined(OSX) + ".globl _UYVYToI420RowUV_SSE2\n" +"_UYVYToI420RowUV_SSE2:\n" +#else + ".global UYVYToI420RowUV_SSE2\n" +"UYVYToI420RowUV_SSE2:\n" +#endif + "push %esi\n" + "push %edi\n" + "mov 0xc(%esp),%eax\n" + "mov 0x10(%esp),%esi\n" + "mov 0x14(%esp),%edx\n" + "mov 0x18(%esp),%edi\n" + "mov 0x1c(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "psrlw $0x8,%xmm7\n" + +"1:" + "movdqa (%eax),%xmm0\n" + "movdqa 0x10(%eax),%xmm1\n" + "movdqa (%eax,%esi,1),%xmm2\n" + "movdqa 0x10(%eax,%esi,1),%xmm3\n" + "lea 0x20(%eax),%eax\n" + "pavgb %xmm2,%xmm0\n" + "pavgb %xmm3,%xmm1\n" + "pand %xmm7,%xmm0\n" + "pand %xmm7,%xmm1\n" + "packuswb %xmm1,%xmm0\n" + "movdqa %xmm0,%xmm1\n" + "pand %xmm7,%xmm0\n" + "packuswb %xmm0,%xmm0\n" + "movq %xmm0,(%edx)\n" + "lea 0x8(%edx),%edx\n" + "psrlw $0x8,%xmm1\n" + "packuswb %xmm1,%xmm1\n" + "movq %xmm1,(%edi)\n" + "lea 0x8(%edi),%edi\n" + "sub $0x10,%ecx\n" + "ja 1b\n" + "pop %edi\n" + "pop %esi\n" + "ret\n" +); +#endif + +void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_pitch_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + // Copy a row of yuy2 UV values + for (int x = 0; x < pix; x += 2) { + dst_u[0] = (src_yuy2[1] + src_yuy2[src_pitch_yuy2 + 1] + 1) >> 1; + dst_v[0] = (src_yuy2[3] + src_yuy2[src_pitch_yuy2 + 3] + 1) >> 1; + src_yuy2 += 4; + dst_u += 1; + dst_v += 1; + } +} + +void YUY2ToI420RowY_C(const uint8* src_yuy2, + 
uint8* dst_y, int pix) { + // Copy a row of yuy2 Y values + for (int x = 0; x < pix; ++x) { + dst_y[0] = src_yuy2[0]; + src_yuy2 += 2; + dst_y += 1; + } +} + +void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_pitch_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + // Copy a row of uyvy UV values + for (int x = 0; x < pix; x += 2) { + dst_u[0] = (src_uyvy[0] + src_uyvy[src_pitch_uyvy + 0] + 1) >> 1; + dst_v[0] = (src_uyvy[2] + src_uyvy[src_pitch_uyvy + 2] + 1) >> 1; + src_uyvy += 4; + dst_u += 1; + dst_v += 1; + } +} + +void UYVYToI420RowY_C(const uint8* src_uyvy, + uint8* dst_y, int pix) { + // Copy a row of uyvy Y values + for (int x = 0; x < pix; ++x) { + dst_y[0] = src_uyvy[1]; + src_uyvy += 2; + dst_y += 1; + } +} + +// Convert YUY2 to I420. +void PlanarFunctions::YUY2ToI420(const uint8* src_yuy2, int src_pitch_yuy2, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height) { + void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_pitch_yuy2, + uint8* dst_u, uint8* dst_v, int pix); + void (*YUY2ToI420RowY)(const uint8* src_yuy2, + uint8* dst_y, int pix); +#if defined(HAS_YUY2TOI420ROW_SSE2) + if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) && + (width % 16 == 0) && + IS_ALIGNED(src_yuy2, 16) && (src_pitch_yuy2 % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) { + YUY2ToI420RowY = YUY2ToI420RowY_SSE2; + YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2; + } else +#endif + { + YUY2ToI420RowY = YUY2ToI420RowY_C; + YUY2ToI420RowUV = YUY2ToI420RowUV_C; + } + for (int y = 0; y < height; ++y) { + if ((y & 1) == 0) { + if (y >= (height - 1) ) { // last chroma on odd height clamp height + src_pitch_yuy2 = 0; + } + YUY2ToI420RowUV(src_yuy2, src_pitch_yuy2, dst_u, dst_v, width); + dst_u += dst_pitch_u; + dst_v += dst_pitch_v; + } + YUY2ToI420RowY(src_yuy2, dst_y, width); + 
dst_y += dst_pitch_y; + src_yuy2 += src_pitch_yuy2; + } +} + +// Convert UYVY to I420. +void PlanarFunctions::UYVYToI420(const uint8* src_uyvy, int src_pitch_uyvy, + uint8* dst_y, int dst_pitch_y, + uint8* dst_u, int dst_pitch_u, + uint8* dst_v, int dst_pitch_v, + int width, int height) { + void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_pitch_uyvy, + uint8* dst_u, uint8* dst_v, int pix); + void (*UYVYToI420RowY)(const uint8* src_uyvy, + uint8* dst_y, int pix); +#if defined(HAS_UYVYTOI420ROW_SSE2) + if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) && + (width % 16 == 0) && + IS_ALIGNED(src_uyvy, 16) && (src_pitch_uyvy % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_pitch_y % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_pitch_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_pitch_v % 8 == 0)) { + UYVYToI420RowY = UYVYToI420RowY_SSE2; + UYVYToI420RowUV = UYVYToI420RowUV_SSE2; + } else +#endif + { + UYVYToI420RowY = UYVYToI420RowY_C; + UYVYToI420RowUV = UYVYToI420RowUV_C; + } + for (int y = 0; y < height; ++y) { + if ((y & 1) == 0) { + if (y >= (height - 1) ) { // last chroma on odd height clamp height + src_pitch_uyvy = 0; + } + UYVYToI420RowUV(src_uyvy, src_pitch_uyvy, dst_u, dst_v, width); + dst_u += dst_pitch_u; + dst_v += dst_pitch_v; + } + UYVYToI420RowY(src_uyvy, dst_y, width); + dst_y += dst_pitch_y; + src_uyvy += src_pitch_uyvy; + } +} + +// Convert I420 to ARGB. +// TODO(fbarchard): Add SSSE3 version and supply C version for fallback. 
+void PlanarFunctions::I420ToARGB(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + for (int y = 0; y < height; ++y) { + FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_pitch_argb; + src_y += src_pitch_y; + if (y & 1) { + src_u += src_pitch_u; + src_v += src_pitch_v; + } + } + // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + EMMS(); +} + +// Convert I422 to ARGB. +void PlanarFunctions::I422ToARGB(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + for (int y = 0; y < height; ++y) { + FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_pitch_argb; + src_y += src_pitch_y; + src_u += src_pitch_u; + src_v += src_pitch_v; + } + // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + EMMS(); +} + +// Convert I444 to ARGB. +void PlanarFunctions::I444ToARGB(const uint8* src_y, int src_pitch_y, + const uint8* src_u, int src_pitch_u, + const uint8* src_v, int src_pitch_v, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + for (int y = 0; y < height; ++y) { + FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_pitch_argb; + src_y += src_pitch_y; + src_u += src_pitch_u; + src_v += src_pitch_v; + } + // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + EMMS(); +} + +// Convert I400 to ARGB. 
+void PlanarFunctions::I400ToARGB_Reference(const uint8* src_y, int src_pitch_y, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + for (int y = 0; y < height; ++y) { + FastConvertYToRGB32Row(src_y, dst_argb, width); + dst_argb += dst_pitch_argb; + src_y += src_pitch_y; + } + // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + EMMS(); +} + +// TODO(fbarchard): 64 bit version +#if defined(WIN32) && !defined(COVERAGE_ENABLED) + +#define HAS_I400TOARGBROW_SSE2 +__declspec(naked) +static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0xff000000 + pslld xmm7, 24 + + wloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm7 + por xmm1, xmm7 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + ja wloop + ret + } +} + +#elif defined(__i386__) && !defined(COVERAGE_ENABLED) && \ + !TARGET_IPHONE_SIMULATOR + +#define HAS_I400TOARGBROW_SSE2 +extern "C" void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, + int pix); + asm( + ".text\n" +#if defined(OSX) + ".globl _I400ToARGBRow_SSE2\n" +"_I400ToARGBRow_SSE2:\n" +#else + ".global I400ToARGBRow_SSE2\n" +"I400ToARGBRow_SSE2:\n" +#endif + "mov 0x4(%esp),%eax\n" + "mov 0x8(%esp),%edx\n" + "mov 0xc(%esp),%ecx\n" + "pcmpeqb %xmm7,%xmm7\n" + "pslld $0x18,%xmm7\n" +"1:" + "movq (%eax),%xmm0\n" + "lea 0x8(%eax),%eax\n" + "punpcklbw %xmm0,%xmm0\n" + "movdqa %xmm0,%xmm1\n" + "punpcklwd %xmm0,%xmm0\n" + "punpckhwd %xmm1,%xmm1\n" + "por %xmm7,%xmm0\n" + "por %xmm7,%xmm1\n" + "movdqa %xmm0,(%edx)\n" + "movdqa %xmm1,0x10(%edx)\n" + "lea 0x20(%edx),%edx\n" + "sub $0x8,%ecx\n" + "ja 1b\n" + "ret\n" +); +#endif + +static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { + // Copy a Y to 
RGB. + for (int x = 0; x < pix; ++x) { + dst_argb[2] = dst_argb[1] = dst_argb[0] = src_y[0]; + dst_argb[3] = 255u; + dst_argb += 4; + src_y += 1; + } +} + +// Convert I400 to ARGB. +void PlanarFunctions::I400ToARGB(const uint8* src_y, int src_pitch_y, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix); +#if defined(HAS_I400TOARGBROW_SSE2) + if (talk_base::CpuInfo::TestCpuFlag(talk_base::CpuInfo::kCpuHasSSE2) && + (width % 8 == 0) && + IS_ALIGNED(src_y, 8) && (src_pitch_y % 8 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_pitch_argb % 16 == 0)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } else +#endif + { + I400ToARGBRow = I400ToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + I400ToARGBRow(src_y, dst_argb, width); + src_y += src_pitch_y; + dst_argb += dst_pitch_argb; + } +} + +static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + dst_argb[0] = src_raw[2]; + dst_argb[1] = src_raw[1]; + dst_argb[2] = src_raw[0]; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +// Convert RAW to ARGB. +void PlanarFunctions::RAWToARGB(const uint8* src_raw, int src_pitch_raw, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + for (int y = 0; y < height; ++y) { + RAWToARGBRow_C(src_raw, dst_argb, width); + src_raw += src_pitch_raw; + dst_argb += dst_pitch_argb; + } +} + +static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + dst_argb[0] = src_bg24[0]; + dst_argb[1] = src_bg24[1]; + dst_argb[2] = src_bg24[2]; + dst_argb[3] = 255u; + dst_argb += 4; + src_bg24 += 3; + } +} + +// Convert BG24 to ARGB. 
+void PlanarFunctions::BG24ToARGB(const uint8* src_bg24, int src_pitch_bg24, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + for (int y = 0; y < height; ++y) { + BG24ToARGBRow_C(src_bg24, dst_argb, width); + src_bg24 += src_pitch_bg24; + dst_argb += dst_pitch_argb; + } +} + +static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + dst_argb[0] = src_abgr[2]; + dst_argb[1] = src_abgr[1]; + dst_argb[2] = src_abgr[0]; + dst_argb[3] = src_abgr[3]; + dst_argb += 4; + src_abgr += 4; + } +} + +// Convert ABGR to ARGB. +void PlanarFunctions::ABGRToARGB(const uint8* src_abgr, int src_pitch_abgr, + uint8* dst_argb, int dst_pitch_argb, + int width, int height) { + for (int y = 0; y < height; ++y) { + ABGRToARGBRow_C(src_abgr, dst_argb, width); + src_abgr += src_pitch_abgr; + dst_argb += dst_pitch_argb; + } +} + } // namespace libyuv diff --git a/source/row.h b/source/row.h index 4d50bf60d..48bb8440e 100644 --- a/source/row.h +++ b/source/row.h @@ -22,6 +22,16 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, uint8* rgb_buf, int width); +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToRGB32Row(const uint8* y_buf, + uint8* rgb_buf, + int width); + #if defined(_MSC_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var #else @@ -68,6 +78,7 @@ extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); #define EMMS() #endif + } // extern "C" #endif // LIBYUV_SOURCE_ROW_H_ diff --git a/source/row_posix.cc b/source/row_posix.cc index 61d89a167..44c89dabd 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -55,6 +55,68 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi ); } +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm( +"1:" + "movzb (%1),%%r10\n" + "lea 
1(%1),%1\n" + "movzb (%2),%%r11\n" + "lea 1(%2),%2\n" + "movq 2048(%5,%%r10,8),%%xmm0\n" + "movzb (%0),%%r10\n" + "movq 4096(%5,%%r11,8),%%xmm1\n" + "paddsw %%xmm1,%%xmm0\n" + "movq (%5,%%r10,8),%%xmm2\n" + "lea 1(%0),%0\n" + "paddsw %%xmm0,%%xmm2\n" + "shufps $0x44,%%xmm2,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movd %%xmm2,0x0(%3)\n" + "lea 4(%3),%3\n" + "sub $0x1,%4\n" + "ja 1b\n" + : + : "r"(y_buf), // %0 + "r"(u_buf), // %1 + "r"(v_buf), // %2 + "r"(rgb_buf), // %3 + "r"(width), // %4 + "r" (_kCoefficientsRgbY) // %5 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2" +); +} + +void FastConvertYToRGB32Row(const uint8* y_buf, // rdi + uint8* rgb_buf, // rcx + int width) { // r8 + asm( +"1:" + "movzb (%0),%%r10\n" + "movzb 0x1(%0),%%r11\n" + "movq (%3,%%r10,8),%%xmm2\n" + "lea 2(%0),%0\n" + "movq (%3,%%r11,8),%%xmm3\n" + "shufps $0x44,%%xmm3,%%xmm2\n" + "psraw $0x6,%%xmm2\n" + "packuswb %%xmm2,%%xmm2\n" + "movq %%xmm2,0x0(%1)\n" + "lea 8(%1),%1\n" + "sub $0x2,%2\n" + "ja 1b\n" + : + : "r"(y_buf), // %0 + "r"(rgb_buf), // %1 + "r"(width), // %2 + "r" (_kCoefficientsRgbY) // %3 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" +); +} + #elif defined(__i386__) // 32 bit gcc version @@ -104,6 +166,81 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, "ret\n" ); +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUV444ToRGB32Row\n" +"_FastConvertYUV444ToRGB32Row:\n" +#else + ".global FastConvertYUV444ToRGB32Row\n" +"FastConvertYUV444ToRGB32Row:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw 
_kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "lea 1(%edx),%edx\n" + "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n" + "psraw $0x6,%mm0\n" + "packuswb %mm0,%mm0\n" + "movd %mm0,0x0(%ebp)\n" + "lea 4(%ebp),%ebp\n" + "sub $0x1,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYToRGB32Row(const uint8* y_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYToRGB32Row\n" +"_FastConvertYToRGB32Row:\n" +#else + ".global FastConvertYToRGB32Row\n" +"FastConvertYToRGB32Row:\n" +#endif + "push %ebx\n" + "mov 0x8(%esp),%eax\n" + "mov 0xc(%esp),%edx\n" + "mov 0x10(%esp),%ecx\n" + +"1:" + "movzbl (%eax),%ebx\n" + "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n" + "psraw $0x6,%mm0\n" + "movzbl 0x1(%eax),%ebx\n" + "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n" + "psraw $0x6,%mm1\n" + "packuswb %mm1,%mm0\n" + "lea 0x2(%eax),%eax\n" + "movq %mm0,(%edx)\n" + "lea 0x8(%edx),%edx\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "pop %ebx\n" + "ret\n" +); + #else // C reference code that mimic the YUV assembly. #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) @@ -158,6 +295,30 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf, rgb_buf += 8; // Advance 2 pixels. } } -#endif +void FastConvertYUV444ToRGB32Row(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + uint8 u = u_buf[x]; + uint8 v = v_buf[x]; + uint8 y = y_buf[x]; + YuvPixel(y, u, v, rgb_buf); + rgb_buf += 4; // Advance 1 pixel. + } +} + +void FastConvertYToRGB32Row(const uint8* y_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + uint8 y = y_buf[x]; + YuvPixel(y, 128, 128, rgb_buf); + rgb_buf += 4; // Advance 1 pixel. 
+ }
+}
+
+#endif
} // extern "C"
diff --git a/source/row_table.cc b/source/row_table.cc
index 6a97da831..71ee07308 100644
--- a/source/row_table.cc
+++ b/source/row_table.cc
@@ -16,14 +16,14 @@ extern "C" {
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
- 0 \
+ static_cast<int16>(256 * 64 - 1) \
}
#define RGBU(i) { \
static_cast<int16>(2.018 * 64 * (i - 128) + 0.5), \
static_cast<int16>(-0.391 * 64 * (i - 128) + 0.5), \
0, \
- static_cast<int16>(256 * 64 - 1) \
+ 0 \
}
#define RGBV(i) { \