diff --git a/include/libyuv/basic_types.h b/include/libyuv/basic_types.h index 5adc2bfdb..99bee1385 100644 --- a/include/libyuv/basic_types.h +++ b/include/libyuv/basic_types.h @@ -19,15 +19,6 @@ #ifndef INT_TYPES_DEFINED #define INT_TYPES_DEFINED -#ifdef COMPILER_MSVC -typedef __int64 int64; -#else -typedef long long int64; -#endif /* COMPILER_MSVC */ -typedef int int32; -typedef short int16; -typedef char int8; - #ifdef COMPILER_MSVC typedef unsigned __int64 uint64; typedef __int64 int64; @@ -38,7 +29,18 @@ typedef __int64 int64; #define UINT64_C(x) x ## UI64 #endif #define INT64_F "I64" -#else +#else // COMPILER_MSVC +#ifdef __LP64__ +typedef unsigned long uint64; +typedef long int64; +#ifndef INT64_C +#define INT64_C(x) x ## L +#endif +#ifndef UINT64_C +#define UINT64_C(x) x ## UL +#endif +#define INT64_F "l" +#else // __LP64__ typedef unsigned long long uint64; typedef long long int64; #ifndef INT64_C @@ -48,10 +50,14 @@ typedef long long int64; #define UINT64_C(x) x ## ULL #endif #define INT64_F "ll" -#endif /* COMPILER_MSVC */ +#endif // __LP64__ +#endif // COMPILER_MSVC typedef unsigned int uint32; +typedef int int32; typedef unsigned short uint16; +typedef short int16; typedef unsigned char uint8; +typedef char int8; #endif // INT_TYPES_DEFINED // Detect compiler is for x86 or x64. diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index fa3b64463..0b95c51cc 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -13,6 +13,7 @@ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" +#include "libyuv/rotate.h" namespace libyuv { @@ -92,6 +93,17 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, uint8* dst_frame, int dst_stride_frame, int width, int height); -} // namespace libyuv +// Convert camera sample to I420 with cropping, rotation and vertical flip. +int ConvertToI420(const uint8* src_frame, size_t src_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int horiz_crop, int vert_crop, + int w, int h, + int dw, int idh, + RotationMode rotation, + uint32 format); + +} // namespace libyuv #endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index c1000e867..263cd083c 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -27,7 +27,9 @@ static const int kCpuInitialized = 8; bool TestCpuFlag(int flag); // For testing, allow CPU flags to be disabled. -// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -1 to enable all. +// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. +// -1 to enable all cpu specific optimizations. +// 0 to disable all cpu specific optimizations. void MaskCpuFlags(int enable_flags); } // namespace libyuv diff --git a/source/convert.cc b/source/convert.cc index 8154dcb78..485457362 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -13,7 +13,11 @@ #include "conversion_tables.h" #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" #include "row.h" +#include "video_common.h" //#define SCALEOPT //Currently for windows only. 
June 2010 @@ -650,7 +654,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); #if defined(HAS_ARGBTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { @@ -661,7 +665,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, ARGBToYRow = ARGBToYRow_C; } #if defined(HAS_ARGBTOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && @@ -703,7 +707,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); #if defined(HAS_BGRATOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { @@ -714,7 +718,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame, ARGBToYRow = BGRAToYRow_C; } #if defined(HAS_BGRATOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && @@ -756,7 +760,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); #if defined(HAS_ABGRTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { @@ -767,7 +771,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame, ARGBToYRow = ABGRToYRow_C; } #if defined(HAS_ABGRTOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && @@ -809,7 +813,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); #if defined(HAS_RGB24TOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { @@ -820,7 +824,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame, ARGBToYRow = RGB24ToYRow_C; } #if defined(HAS_RGB24TOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && @@ -862,7 +866,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); #if defined(HAS_RAWTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 
0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { @@ -873,7 +877,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, ARGBToYRow = RAWToYRow_C; } #if defined(HAS_RAWTOUVROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && @@ -901,4 +905,163 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame, return 0; } +// Convert camera sample to I420 with cropping, rotation and vertical flip. +int ConvertToI420(const uint8* sample, size_t sample_size, + uint8* y, int y_stride, + uint8* u, int u_stride, + uint8* v, int v_stride, + int horiz_crop, int vert_crop, + int w, int h, + int dw, int idh, + RotationMode rotation, + uint32 format) { + int aw = (w + 1) & ~1; + const uint8* src; + const uint8* src_uv; + int abs_h = (h < 0) ? -h : h; + switch (format) { + // Single plane formats + case FOURCC_YUY2: + src = sample + (aw * vert_crop + horiz_crop) * 2 ; + YUY2ToI420(src, aw * 2, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_UYVY: + src = sample + (aw * vert_crop + horiz_crop) * 2; + UYVYToI420(src, aw * 2, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_24BG: + src = sample + (w * vert_crop + horiz_crop) * 3; + RGB24ToI420(src, w * 3, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_RAW: + src = sample + (w * vert_crop + horiz_crop) * 3; + RAWToI420(src, w * 3, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_ARGB: + src = sample + (w * vert_crop + horiz_crop) * 4; + ARGBToI420(src, w * 4, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_BGRA: + src = sample + (w * vert_crop + horiz_crop) * 4; + BGRAToI420(src, w * 4, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_ABGR: + src = sample + (w * vert_crop + horiz_crop) * 4; + ABGRToI420(src, w * 4, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_BGGR: + case FOURCC_RGGB: + case FOURCC_GRBG: + case FOURCC_GBRG: + // TODO(fbarchard): We could support cropping by odd numbers by + // adjusting fourcc. + src = sample + (w * vert_crop + horiz_crop); + BayerRGBToI420(src, w, format, + y, y_stride, u, u_stride, v, v_stride, + dw, idh); + break; + // Biplanar formats + case FOURCC_M420: + src = sample + (w * vert_crop) * 12 / 8 + horiz_crop; + M420ToI420(src, w, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + case FOURCC_NV12: + src = sample + (w * vert_crop + horiz_crop); + src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop; + NV12ToI420Rotate(src, w, + src_uv, aw, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh, rotation); + break; + case FOURCC_NV21: + src = sample + (w * vert_crop + horiz_crop); + src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop; + // Call NV12 but with u and v parameters swapped. 
+ NV12ToI420Rotate(src, w, + src_uv, aw, + y, y_stride, + v, v_stride, + u, u_stride, + dw, idh, rotation); + break; + case FOURCC_Q420: + src = sample + (w + aw * 2) * vert_crop + horiz_crop; + src_uv = sample + (w + aw * 2) * vert_crop + w + horiz_crop * 2; + Q420ToI420(src, w * 3, + src_uv, w * 3, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh); + break; + // Triplanar formats + case FOURCC_I420: + case FOURCC_YV12: { + const uint8* src_y = sample + (w * vert_crop + horiz_crop); + const uint8* src_u; + const uint8* src_v; + int halfwidth = (w + 1) / 2; + int halfheight = (abs_h + 1) / 2; + if (format == FOURCC_I420) { + src_u = sample + w * abs_h + + (halfwidth * vert_crop + horiz_crop) / 2; + src_v = sample + w * abs_h + + halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2; + } else { + src_v = sample + w * abs_h + + (halfwidth * vert_crop + horiz_crop) / 2; + src_u = sample + w * abs_h + + halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2; + } + I420Rotate(src_y, w, + src_u, halfwidth, + src_v, halfwidth, + y, y_stride, + u, u_stride, + v, v_stride, + dw, idh, rotation); + break; + } + // Formats not supported + case FOURCC_MJPG: + default: + return -1; // unknown fourcc - return failure code. + } + return 0; +} + } // namespace libyuv diff --git a/source/cpu_id.cc b/source/cpu_id.cc index cc44e2158..9903c4a87 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -14,11 +14,14 @@ #ifdef _MSC_VER #include <intrin.h> #endif +#ifdef __ANDROID__ +#include <cpu-features.h> +#endif // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) static inline void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( + asm volatile ( "mov %%ebx, %%edi\n" "cpuid\n" "xchg %%edi, %%ebx\n" : "=a"(cpu_info[0]), } #elif defined(__i386__) || defined(__x86_64__) static inline void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( + asm volatile ( "cpuid\n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type) @@ -49,6 +52,10 @@ static void InitCpuFlags() { cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) | (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) | kCpuInitialized; +#elif defined(__ANDROID__) && defined(__ARM_NEON__) + uint64_t features = android_getCpuFeatures(); + cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) | + kCpuInitialized; #elif defined(__ARM_NEON__) // gcc -mfpu=neon defines __ARM_NEON__ // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags @@ -61,14 +68,14 @@ static void InitCpuFlags() { void MaskCpuFlags(int enable_flags) { InitCpuFlags(); - cpu_info_ &= enable_flags; + cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized; } bool TestCpuFlag(int flag) { if (0 == cpu_info_) { InitCpuFlags(); } - return cpu_info_ & flag ? true : false; + return (cpu_info_ & flag) ?
true : false; } } // namespace libyuv diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 2f2eac6ec..303eb5913 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -14,8 +14,6 @@ #include "video_common.h" #include "row.h" -#define kMaxStride (2048 * 4) - namespace libyuv { // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers @@ -168,7 +166,7 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 g = src_bayer0[1]; uint8 r = src_bayer1[1]; - for (int x = 0; x < (pix - 2); x += 2) { + for (int x = 0; x < pix - 3; x += 2) { dst_rgb[0] = src_bayer0[0]; dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[2] = AVG(r, src_bayer1[1]); @@ -187,10 +185,12 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer, dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[2] = AVG(r, src_bayer1[1]); dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer0[0]; - dst_rgb[5] = src_bayer0[1]; - dst_rgb[6] = src_bayer1[1]; - dst_rgb[7] = 255U; + if (pix & 1) { + dst_rgb[4] = src_bayer0[0]; + dst_rgb[5] = src_bayer0[1]; + dst_rgb[6] = src_bayer1[1]; + dst_rgb[7] = 255U; + } } static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, @@ -198,7 +198,7 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 g = src_bayer0[1]; uint8 b = src_bayer1[1]; - for (int x = 0; x < (pix - 2); x += 2) { + for (int x = 0; x < pix - 3; x += 2) { dst_rgb[0] = AVG(b, src_bayer1[1]); dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[2] = src_bayer0[0]; @@ -217,17 +217,19 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer, dst_rgb[1] = AVG(g, src_bayer0[1]); dst_rgb[2] = src_bayer0[0]; dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer1[1]; - dst_rgb[5] = src_bayer0[1]; - dst_rgb[6] = src_bayer0[0]; - dst_rgb[7] = 255U; + if (pix & 1) { + dst_rgb[4] = src_bayer1[1]; + dst_rgb[5] = src_bayer0[1]; + dst_rgb[6] = src_bayer0[0]; + dst_rgb[7] = 255U; + } } static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, uint8* dst_rgb, int pix) { const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 b = src_bayer0[1]; - for (int x = 0; x < (pix - 2); x += 2) { + for (int x = 0; x < pix - 3; x += 2) { dst_rgb[0] = AVG(b, src_bayer0[1]); dst_rgb[1] = src_bayer0[0]; dst_rgb[2] = src_bayer1[0]; @@ -245,17 +247,19 @@ static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer, dst_rgb[1] = src_bayer0[0]; dst_rgb[2] = src_bayer1[0]; dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer0[1]; - dst_rgb[5] = src_bayer0[0]; - dst_rgb[6] = src_bayer1[0]; - dst_rgb[7] = 255U; + if (pix & 1) { + dst_rgb[4] = src_bayer0[1]; + dst_rgb[5] = src_bayer0[0]; + dst_rgb[6] = src_bayer1[0]; + dst_rgb[7] = 255U; + } } static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, uint8* dst_rgb, int pix) { const uint8* src_bayer1 = src_bayer0 + src_stride_bayer; uint8 r = src_bayer0[1]; - for (int x = 0; x < (pix - 2); x += 2) { + for (int x = 0; x < pix - 3; x += 2) { dst_rgb[0] = src_bayer1[0]; dst_rgb[1] = src_bayer0[0]; dst_rgb[2] = AVG(r, src_bayer0[1]); @@ -273,10 +277,12 @@ static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer, dst_rgb[1] = src_bayer0[0]; dst_rgb[2] = AVG(r, src_bayer0[1]); dst_rgb[3] = 255U; - dst_rgb[4] = src_bayer1[0]; - dst_rgb[5] = src_bayer0[0]; - dst_rgb[6] = src_bayer0[1]; - dst_rgb[7] = 255U; + if (pix & 1) { + dst_rgb[4] = src_bayer1[0]; + dst_rgb[5] = 
src_bayer0[0]; + dst_rgb[6] = src_bayer0[1]; + dst_rgb[7] = 255U; + } } // Converts any Bayer RGB format to ARGB. @@ -315,7 +321,7 @@ int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer, break; } - for (int y = 0; y < (height - 1); y += 2) { + for (int y = 0; y < height - 1; y += 2) { BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width); BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, dst_rgb + dst_stride_rgb, width); @@ -403,7 +409,7 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, break; } - for (int y = 0; y < (height - 1); y += 2) { + for (int y = 0; y < height - 1; y += 2) { BayerRow0(src_bayer, src_stride_bayer, row, width); BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, row + kMaxStride, width); diff --git a/source/planar_functions.cc b/source/planar_functions.cc index a7e3e38a6..e7a58717d 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -26,11 +26,11 @@ static void SplitUV_NEON(const uint8* src_uv, __asm__ volatile ( "1:\n" - "vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV - "vst1.u8 {q0}, [%1]! \n" // store U - "vst1.u8 {q1}, [%2]! \n" // Store V - "subs %3, %3, #16 \n" // 16 processed per loop - "bhi 1b \n" + "vld2.u8 {q0,q1}, [%0]!\n" // load 16 pairs of UV + "vst1.u8 {q0}, [%1]!\n" // store U + "vst1.u8 {q1}, [%2]!\n" // Store V + "subs %3, %3, #16\n" // 16 processed per loop + "bhi 1b\n" : "+r"(src_uv), "+r"(dst_u), "+r"(dst_v), @@ -48,16 +48,6 @@ static void SplitUV_NEON(const uint8* src_uv, #define TALIGN16(t, var) t var __attribute__((aligned(16))) #endif -// Shuffle table for converting ABGR to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; - -// Shuffle table for converting BGRA to ARGB. 
-extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; - #if defined(WIN32) && !defined(COVERAGE_ENABLED) #define HAS_SPLITUV_SSE2 __declspec(naked) @@ -69,8 +59,8 @@ static void SplitUV_SSE2(const uint8* src_uv, mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 wloop: movdqa xmm0, [eax] @@ -78,8 +68,8 @@ static void SplitUV_SSE2(const uint8* src_uv, lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 - pand xmm0, xmm7 // even bytes - pand xmm1, xmm7 + pand xmm0, xmm5 // even bytes + pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa [edx], xmm0 lea edx, [edx + 16] @@ -101,16 +91,16 @@ static void SplitUV_SSE2(const uint8* src_uv, static void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" + "pcmpeqb %%xmm5,%%xmm5\n" + "psrlw $0x8,%%xmm5\n" "1:" "movdqa (%0),%%xmm0\n" "movdqa 0x10(%0),%%xmm1\n" "lea 0x20(%0),%0\n" "movdqa %%xmm0,%%xmm2\n" "movdqa %%xmm1,%%xmm3\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" + "pand %%xmm5,%%xmm0\n" + "pand %%xmm5,%%xmm1\n" "packuswb %%xmm1,%%xmm0\n" "movdqa %%xmm0,(%1)\n" "lea 0x10(%1),%1\n" @@ -126,7 +116,10 @@ static void SplitUV_SSE2(const uint8* src_uv, "+r"(dst_v), // %2 "+r"(pix) // %3 : - : "memory" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif ); } #endif @@ -196,15 +189,15 @@ int I420Copy(const uint8* src_y, int src_stride_y, static void SetRow32_NEON(uint8* dst, uint32 v32, int count) { __asm__ volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints + "vdup.u32 q0, %2\n" // duplicate 4 ints "1:\n" - "vst1.u32 {q0}, [%0]! 
\n" // store - "subs %1, %1, #16 \n" // 16 processed per loop - "bhi 1b \n" + "vst1.u32 {q0}, [%0]!\n" // store + "subs %1, %1, #16\n" // 16 processed per loop + "bhi 1b\n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 - : "q0", "memory" + : "q0", "memory", "cc" ); } @@ -214,12 +207,12 @@ __declspec(naked) static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { __asm { mov eax, [esp + 4] // dst - movd xmm7, [esp + 8] // v32 + movd xmm5, [esp + 8] // v32 mov ecx, [esp + 12] // count - pshufd xmm7, xmm7, 0 + pshufd xmm5, xmm5, 0 wloop: - movdqa [eax], xmm7 + movdqa [eax], xmm5 lea eax, [eax + 16] sub ecx, 16 ja wloop @@ -233,17 +226,20 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { #define HAS_SETROW_SSE2 static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { asm volatile( - "movd %2, %%xmm7\n" - "pshufd $0x0,%%xmm7,%%xmm7\n" + "movd %2, %%xmm5\n" + "pshufd $0x0,%%xmm5,%%xmm5\n" "1:" - "movdqa %%xmm7,(%0)\n" + "movdqa %%xmm5,(%0)\n" "lea 0x10(%0),%0\n" "sub $0x10,%1\n" "ja 1b\n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 - : "memory" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm5" +#endif ); } #endif @@ -257,13 +253,13 @@ static void I420SetPlane(uint8* dst_y, int dst_stride_y, int value) { void (*SetRow)(uint8* dst, uint32 value, int pix); #if defined(HAS_SETROW_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + if (TestCpuFlag(kCpuHasNEON) && (width % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { SetRow = SetRow32_NEON; } else #elif defined(HAS_SETROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && (width % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { SetRow = SetRow32_SSE2; @@ -418,7 +414,7 @@ static int X420ToI420(const uint8* src_y, int halfwidth = (width + 1) >> 1; void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); #if defined(HAS_SPLITUV_NEON) - if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && + if (TestCpuFlag(kCpuHasNEON) && (halfwidth % 16 == 0) && IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && @@ -426,7 +422,7 @@ static int X420ToI420(const uint8* src_y, SplitUV = SplitUV_NEON; } else #elif defined(HAS_SPLITUV_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && (halfwidth % 16 == 0) && IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) && IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) && @@ -510,8 +506,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, mov esi, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 wloop: movdqa xmm0, [eax] @@ -519,8 +515,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 - pand xmm2, xmm7 // even bytes are Y - pand xmm3, xmm7 + pand xmm2, xmm5 // even bytes are Y + pand xmm3, xmm5 packuswb xmm2, xmm3 movdqa [edx], xmm2 lea edx, [edx + 16] @@ -528,7 +524,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 - pand xmm0, xmm7 // U + pand xmm0, xmm5 // U packuswb xmm0, xmm0 movq qword ptr [esi], xmm0 lea esi, [esi + 8] @@ -551,16 +547,16 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { asm volatile( - 
"pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" + "pcmpeqb %%xmm5,%%xmm5\n" + "psrlw $0x8,%%xmm5\n" "1:" "movdqa (%0),%%xmm0\n" "movdqa 0x10(%0),%%xmm1\n" "lea 0x20(%0),%0\n" "movdqa %%xmm0,%%xmm2\n" "movdqa %%xmm1,%%xmm3\n" - "pand %%xmm7,%%xmm2\n" - "pand %%xmm7,%%xmm3\n" + "pand %%xmm5,%%xmm2\n" + "pand %%xmm5,%%xmm3\n" "packuswb %%xmm3,%%xmm2\n" "movdqa %%xmm2,(%1)\n" "lea 0x10(%1),%1\n" @@ -568,7 +564,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, "psrlw $0x8,%%xmm1\n" "packuswb %%xmm1,%%xmm0\n" "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" + "pand %%xmm5,%%xmm0\n" "packuswb %%xmm0,%%xmm0\n" "movq %%xmm0,(%2)\n" "lea 0x8(%2),%2\n" @@ -584,7 +580,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, "+r"(dst_v), // %3 "+r"(pix) // %4 : - : "memory" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif ); } #endif @@ -626,7 +625,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, void (*SplitYUY2)(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix); #if defined(HAS_SPLITYUY2_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && (width % 16 == 0) && IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && @@ -662,15 +661,15 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, mov eax, [esp + 4] // src_yuy2 mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm7 // even bytes are Y - pand xmm1, xmm7 + pand xmm0, xmm5 // even bytes are Y + pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa [edx], xmm0 lea edx, [edx + 16] @@ -691,8 +690,8 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 wloop: movdqa xmm0, [eax] @@ -706,7 +705,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 - pand xmm0, xmm7 // U + pand xmm0, xmm5 // U packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 lea edx, [edx + 8] @@ -758,8 +757,8 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 wloop: movdqa xmm0, [eax] @@ -769,11 +768,11 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - pand xmm0, xmm7 // UYVY -> UVUV - pand xmm1, xmm7 + pand xmm0, xmm5 // UYVY -> UVUV + pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 - pand xmm0, xmm7 // U + pand xmm0, xmm5 // U packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 lea edx, [edx + 8] @@ -797,14 +796,14 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" + "pcmpeqb %%xmm5,%%xmm5\n" + "psrlw $0x8,%%xmm5\n" "1:" "movdqa (%0),%%xmm0\n" "movdqa 0x10(%0),%%xmm1\n" "lea 0x20(%0),%0\n" - "pand 
%%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" + "pand %%xmm5,%%xmm0\n" + "pand %%xmm5,%%xmm1\n" "packuswb %%xmm1,%%xmm0\n" "movdqa %%xmm0,(%1)\n" "lea 0x10(%1),%1\n" @@ -814,15 +813,18 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "memory" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif ); } static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_y, int pix) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" + "pcmpeqb %%xmm5,%%xmm5\n" + "psrlw $0x8,%%xmm5\n" "1:" "movdqa (%0),%%xmm0\n" "movdqa 0x10(%0),%%xmm1\n" @@ -835,7 +837,7 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, "psrlw $0x8,%%xmm1\n" "packuswb %%xmm1,%%xmm0\n" "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" + "pand %%xmm5,%%xmm0\n" "packuswb %%xmm0,%%xmm0\n" "movq %%xmm0,(%1)\n" "lea 0x8(%1),%1\n" @@ -850,7 +852,10 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, "+r"(dst_y), // %2 "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 - : "memory" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif ); } #define HAS_UYVYTOI420ROW_SSE2 @@ -872,15 +877,18 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "memory" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif ); } static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_y, int pix) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" + "pcmpeqb %%xmm5,%%xmm5\n" + "psrlw $0x8,%%xmm5\n" "1:" "movdqa (%0),%%xmm0\n" "movdqa 0x10(%0),%%xmm1\n" @@ -889,11 +897,11 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, "lea 0x20(%0),%0\n" "pavgb %%xmm2,%%xmm0\n" "pavgb %%xmm3,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" - "pand %%xmm7,%%xmm1\n" + "pand %%xmm5,%%xmm0\n" + "pand %%xmm5,%%xmm1\n" "packuswb %%xmm1,%%xmm0\n" "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm7,%%xmm0\n" + "pand %%xmm5,%%xmm0\n" "packuswb %%xmm0,%%xmm0\n" "movq %%xmm0,(%1)\n" "lea 0x8(%1),%1\n" @@ -908,7 +916,10 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, "+r"(dst_y), // %2 "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 - : "memory" + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif ); } #endif @@ -975,7 +986,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, void (*YUY2ToI420RowY)(const uint8* src_yuy2, uint8* dst_y, int pix); #if defined(HAS_YUY2TOI420ROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && (width % 16 == 0) && IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && @@ -1022,7 +1033,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, void (*UYVYToI420RowY)(const uint8* src_uyvy, uint8* dst_y, int pix); #if defined(HAS_UYVYTOI420ROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && (width % 16 == 0) && IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && @@ -1053,7 +1064,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, } // Convert I420 to ARGB. -// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
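For orientation, a hedged usage sketch of the new ConvertToI420() entry point declared in include/libyuv/convert.h above, which routes FOURCC_YUY2 and FOURCC_UYVY samples to the YUY2ToI420()/UYVYToI420() converters in this file. The FOURCC_YUY2 constant (from the non-public video_common.h) and the kRotate0 enumerator from libyuv/rotate.h are assumptions inferred from identifiers used elsewhere in this patch, not verified declarations.

#include "libyuv/convert.h"

// Hypothetical wrapper: convert one packed YUY2 sample into caller-allocated
// I420 planes, no cropping, no rotation. The luma plane is w*h bytes and each
// chroma plane is ((w+1)/2)*((h+1)/2) bytes.
int Yuy2SampleToI420(const uint8* sample, size_t sample_size,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v,
                     int w, int h) {
  return libyuv::ConvertToI420(sample, sample_size,
                               dst_y, w,            // dst_y, dst_stride_y
                               dst_u, (w + 1) / 2,  // dst_u, dst_stride_u
                               dst_v, (w + 1) / 2,  // dst_v, dst_stride_v
                               0, 0,                // horiz_crop, vert_crop
                               w, h,                // source width and height
                               w, h,                // dw, idh: size of the crop window written out
                               libyuv::kRotate0,    // assumed RotationMode enumerator
                               FOURCC_YUY2);        // assumed fourcc constant
}

Per the switch in source/convert.cc above, the call returns 0 on success and -1 for an unrecognized fourcc such as FOURCC_MJPG.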
int I420ToARGB(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -1065,8 +1075,34 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (width % 4 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow4_SSE2; + } else +#endif +#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (width % 2 == 0)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2; + } else +#endif +#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX) + if (width % 2 == 0) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX; + } else +#endif + { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + } for (int y = 0; y < height; ++y) { - FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); + FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1074,7 +1110,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, src_v += src_stride_v; } } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + // MMX used for FastConvertYUVToARGBRow requires an emms instruction. EMMS(); return 0; } @@ -1091,6 +1127,25 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + void (*FastConvertYUVToBGRARow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (width % 2 == 0)) { + FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2; + } else +#endif +#if defined(HAS_FASTCONVERTYUVTOBGRAROW_MMX) + if (width % 2 == 0) { + FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_MMX; + } else +#endif + { + FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C; + } for (int y = 0; y < height; ++y) { FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; @@ -1104,7 +1159,7 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, return 0; } -// Convert I420 to BGRA. +// Convert I420 to ABGR. 
int I420ToABGR(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -1116,6 +1171,25 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + void (*FastConvertYUVToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (width % 2 == 0)) { + FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2; + } else +#endif +#if defined(HAS_FASTCONVERTYUVTOABGRROW_MMX) + if (width % 2 == 0) { + FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_MMX; + } else +#endif + { + FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C; + } for (int y = 0; y < height; ++y) { FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; @@ -1141,14 +1215,33 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (width % 2 == 0)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2; + } else +#endif +#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX) + if (width % 2 == 0) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX; + } else +#endif + { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; + } for (int y = 0; y < height; ++y) { - FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width); + FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + // MMX used for FastConvertYUVToARGBRow requires an emms instruction. EMMS(); return 0; } @@ -1165,14 +1258,31 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + void (*FastConvertYUV444ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2; + } else +#endif +#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX) + FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_MMX; +#else + { + FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C; + } +#endif for (int y = 0; y < height; ++y) { - FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width); + FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + // MMX used for FastConvertYUVToARGBRow requires an emms instruction. 
EMMS(); return 0; } @@ -1187,178 +1297,34 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + void (*FastConvertYToARGBRow)(const uint8* y_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + (width % 2 == 0) && + IS_ALIGNED(dst_argb, 8) && (dst_stride_argb % 8 == 0)) { + FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2; + } else +#endif +#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX) + if (width % 2 == 0) { + FastConvertYToARGBRow = FastConvertYToARGBRow_MMX; + } else +#endif + { + FastConvertYToARGBRow = FastConvertYToARGBRow_C; + } for (int y = 0; y < height; ++y) { - FastConvertYToRGB32Row(src_y, dst_argb, width); + FastConvertYToARGBRow(src_y, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; } - // MMX used for FastConvertYUVToRGB32Row requires an emms instruction. + // MMX used for FastConvertYUVToARGBRow requires an emms instruction. EMMS(); return 0; } -// TODO(fbarchard): 64 bit version -#if defined(WIN32) && !defined(COVERAGE_ENABLED) - -#define HAS_I400TOARGBROW_SSE2 -__declspec(naked) -static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - - wloop: - movq xmm0, qword ptr [eax] - lea eax, [eax + 8] - punpcklbw xmm0, xmm0 - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - por xmm0, xmm7 - por xmm1, xmm7 - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - ja wloop - ret - } -} - -#define HAS_ABGRTOARGBROW_SSSE3 -__declspec(naked) -static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_abgr - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm7, _kShuffleMaskABGRToARGB - - convertloop : - movdqa xmm0, [eax] - lea eax, [eax + 16] - pshufb xmm0, xmm7 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - ja convertloop - ret - } -} - -#define HAS_BGRATOARGBROW_SSSE3 -__declspec(naked) -static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_bgra - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm7, _kShuffleMaskBGRAToARGB - - convertloop : - movdqa xmm0, [eax] - lea eax, [eax + 16] - pshufb xmm0, xmm7 - movdqa [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - ja convertloop - ret - } -} - - -#elif (defined(__x86_64__) || defined(__i386__)) && \ - !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) - -// TODO(yuche): consider moving ARGB related codes to a separate file. 
-#define HAS_I400TOARGBROW_SSE2 -static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "pslld $0x18,%%xmm7\n" -"1:" - "movq (%0),%%xmm0\n" - "lea 0x8(%0),%0\n" - "punpcklbw %%xmm0,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "punpcklwd %%xmm0,%%xmm0\n" - "punpckhwd %%xmm1,%%xmm1\n" - "por %%xmm7,%%xmm0\n" - "por %%xmm7,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "movdqa %%xmm1,0x10(%1)\n" - "lea 0x20(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "memory" -); -} - -#define HAS_ABGRTOARGBROW_SSSE3 -static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, - int pix) { - asm volatile( - "movdqa (%3),%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm7,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" - : "+r"(src_abgr), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(kShuffleMaskABGRToARGB) // %3 - : "memory" -); -} - -#define HAS_BGRATOARGBROW_SSSE3 -static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, - int pix) { - asm volatile( - "movdqa (%3),%%xmm7\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm7,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" - : "+r"(src_bgra), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(kShuffleMaskBGRAToARGB) // %3 - : "memory" -); -} - -#endif - -static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { - // Copy a Y to RGB. - for (int x = 0; x < pix; ++x) { - uint8 y = src_y[0]; - dst_argb[2] = dst_argb[1] = dst_argb[0] = y; - dst_argb[3] = 255u; - dst_argb += 4; - ++src_y; - } -} - // Convert I400 to ARGB. int I400ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, @@ -1370,7 +1336,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, } void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix); #if defined(HAS_I400TOARGBROW_SSE2) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && (width % 8 == 0) && IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) && IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { @@ -1389,22 +1355,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, return 0; } -static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - // To support in-place conversion. - uint8 r = src_abgr[0]; - uint8 g = src_abgr[1]; - uint8 b = src_abgr[2]; - uint8 a = src_abgr[3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - dst_argb += 4; - src_abgr += 4; - } -} - int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, uint8* dst_argb, int dst_stride_argb, int width, int height) { @@ -1415,7 +1365,7 @@ int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, } void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix); #if defined(HAS_ABGRTOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 4 == 0) && IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) && IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { @@ -1434,22 +1384,6 @@ void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix); return 0; } -static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - // To support in-place conversion. 
- uint8 a = src_bgra[0]; - uint8 r = src_bgra[1]; - uint8 g = src_bgra[2]; - uint8 b = src_bgra[3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - dst_argb += 4; - src_bgra += 4; - } -} - // Convert BGRA to ARGB. int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, uint8* dst_argb, int dst_stride_argb, @@ -1461,7 +1395,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, } void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix); #if defined(HAS_BGRATOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 4 == 0) && IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) && IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { @@ -1491,7 +1425,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); #if defined(HAS_ARGBTOYROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 4 == 0) && IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) && IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { @@ -1522,7 +1456,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, } void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix); #if defined(HAS_RAWTOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) && IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { @@ -1552,7 +1486,7 @@ int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, } void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix); #if defined(HAS_BG24TOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + if (TestCpuFlag(kCpuHasSSSE3) && (width % 16 == 0) && IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) && IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { diff --git a/source/row.h b/source/row.h index 85343c563..958a833b1 100644 --- a/source/row.h +++ b/source/row.h @@ -13,9 +13,13 @@ #include "libyuv/basic_types.h" +#define kMaxStride (2048 * 4) + // The following are available on all x86 platforms -#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ - && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_ABGRTOARGBROW_SSSE3 +#define HAS_BGRATOARGBROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BG24TOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 @@ -23,19 +27,41 @@ #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOUVROW_SSSE3 #define HAS_RAWTOUVROW_SSSE3 -#endif - -// The following are available only on Windows -#if defined(WIN32) \ - && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) #define HAS_BGRATOYROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 +#endif + +// The following are available on Windows and Linux +#if (defined(WIN32) || defined(__x86_64__) || \ + (defined(__i386__) && !defined(__pic__))) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) #define HAS_ARGBTOUVROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3 #endif +// The following are available on Linux (32/64 bit) +// TODO(fbarchard): enable for fpic on linux +#if (defined(__x86_64__) || \ + (defined(__i386__) && !defined(__pic__))) && \ + !defined(COVERAGE_ENABLED) && 
!defined(TARGET_IPHONE_SIMULATOR) +#define HAS_FASTCONVERTYUVTOARGBROW_SSE2 +#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2 +#define HAS_FASTCONVERTYUVTOABGRROW_SSE2 +#endif + +// The following are available on Windows and GCC 32 bit +#if (defined(WIN32) || \ + defined(__i386__)) && \ + !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_FASTCONVERTYUVTOARGBROW_MMX +#define HAS_FASTCONVERTYUVTOBGRAROW_MMX +#define HAS_FASTCONVERTYUVTOABGRROW_MMX +#endif + extern "C" { + #ifdef HAS_ARGBTOYROW_SSSE3 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); @@ -75,56 +101,128 @@ void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); #ifdef HAS_BG24TOARGBROW_SSSE3 +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); #endif +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); +#ifdef HAS_I400TOARGBROW_SSE2 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +#endif +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); + #if defined(_MSC_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var -#else +#else // __GNUC__ #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #define TALIGN16(t, var) t var __attribute__((aligned(16))) +typedef signed char __attribute__((vector_size(16))) vec8; +typedef unsigned char __attribute__((vector_size(16))) uvec8; #endif -#ifdef OSX -extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); -extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); -extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); -#else -extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); -extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]); -extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]); -#endif -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); +extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); +extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, +void FastConvertYUVToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); 
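The HAS_* blocks above only decide which row functions get compiled for a given platform; the converters still choose between the SIMD and _C rows at run time via TestCpuFlag(). A minimal sketch of the testing idiom suggested by the expanded MaskCpuFlags() comment in include/libyuv/cpu_id.h (the converter calls are left as placeholders):

#include "libyuv/cpu_id.h"

// Illustrative only: force the portable C rows, run a conversion, then
// restore full CPU detection so the SIMD rows become eligible again.
void CompareSimdAgainstCRows() {
  libyuv::MaskCpuFlags(0);   // 0 disables all cpu specific optimizations.
  // ... run e.g. I420ToARGB() here; it dispatches to the *_C rows ...
  libyuv::MaskCpuFlags(-1);  // -1 re-enables all cpu specific optimizations.
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3)) {
    // ... run the same conversion again and compare the two outputs ...
  }
}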
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 +void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, +void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width); +#endif + +#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX +void FastConvertYUVToARGBRow_MMX(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width); +void FastConvertYUVToBGRARow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUVToABGRRow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void FastConvertYToARGBRow_MMX(const uint8* y_buf, + uint8* rgb_buf, + int width); +#endif // Method to force C version. //#define USE_MMX 0 diff --git a/source/row_posix.cc b/source/row_posix.cc index 88ce475b4..090c1a635 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -15,62 +15,128 @@ extern "C" { #ifdef HAS_ARGBTOYROW_SSSE3 // Constant multiplication table for converting ARGB to I400. -extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = { - 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u +static const vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; -extern "C" TALIGN16(const uint8, kAdd16[16]) = { - 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u +static const uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, }; +#ifdef HAS_ARGBTOUVROW_SSSE3 +static const vec8 kARGBToU = { + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 +}; + +static const uvec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; +static const uvec8 kAddUV128 = { + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u +}; +#endif + // Shuffle table for converting BG24 to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { +static const uvec8 kShuffleMaskBG24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; // Shuffle table for converting RAW to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { +static const uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +// Shuffle table for converting ABGR to ARGB. +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting BGRA to ARGB. 
+static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile( - "movdqa (%3),%%xmm7\n" - "movdqa (%4),%%xmm6\n" - "movdqa %%xmm6,%%xmm5\n" - "psllw $0x4,%%xmm5\n" // Generate a mask of 0x10 on each byte. + "pcmpeqb %%xmm5,%%xmm5\n" + "pslld $0x18,%%xmm5\n" "1:" - "movdqa (%0),%%xmm0\n" - "pmaddubsw %%xmm7,%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "psrlw $0x7,%%xmm0\n" - "pmaddubsw %%xmm7,%%xmm1\n" - "lea 0x20(%0),%0\n" - "psrlw $0x7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "pmaddubsw %%xmm6,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "paddb %%xmm5,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" + "movq (%0),%%xmm0\n" + "lea 0x8(%0),%0\n" + "punpcklbw %%xmm0,%%xmm0\n" + "movdqa %%xmm0,%%xmm1\n" + "punpcklwd %%xmm0,%%xmm0\n" + "punpckhwd %%xmm1,%%xmm1\n" + "por %%xmm5,%%xmm0\n" + "por %%xmm5,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "movdqa %%xmm1,0x10(%1)\n" + "lea 0x20(%1),%1\n" "sub $0x8,%2\n" "ja 1b\n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 - : "r"(kMultiplyMaskARGBToI400), // %3 - "r"(kAdd16) // %4 - : "memory" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif ); } + +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { + asm volatile( + "movdqa %3,%%xmm5\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea 0x10(%0),%0\n" + "pshufb %%xmm5,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : "+r"(src_abgr), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskABGRToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" #endif -#ifdef HAS_BG24TOARGBROW_SSSE3 +); +} + +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { + asm volatile( + "movdqa %3,%%xmm5\n" +"1:" + "movdqa (%0),%%xmm0\n" + "lea 0x10(%0),%0\n" + "pshufb %%xmm5,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x4,%2\n" + "ja 1b\n" + : "+r"(src_bgra), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "m"(kShuffleMaskBGRAToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif +); +} + void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm7\n" - "movdqa (%3),%%xmm6\n" + "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5\n" + "movdqa %3,%%xmm4\n" "1:" "movdqa (%0),%%xmm0\n" "movdqa 0x10(%0),%%xmm1\n" @@ -78,19 +144,19 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { "lea 0x30(%0),%0\n" "movdqa %%xmm3,%%xmm2\n" "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm6,%%xmm2\n" - "por %%xmm7,%%xmm2\n" + "pshufb %%xmm4,%%xmm2\n" + "por %%xmm5,%%xmm2\n" "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm6,%%xmm0\n" + "pshufb %%xmm4,%%xmm0\n" "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm7,%%xmm0\n" - "pshufb %%xmm6,%%xmm1\n" + "por %%xmm5,%%xmm0\n" + "pshufb %%xmm4,%%xmm1\n" "movdqa %%xmm0,(%1)\n" - "por %%xmm7,%%xmm1\n" + "por %%xmm5,%%xmm1\n" "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm6,%%xmm3\n" + "pshufb %%xmm4,%%xmm3\n" "movdqa %%xmm1,0x10(%1)\n" - "por %%xmm7,%%xmm3\n" + "por %%xmm5,%%xmm3\n" "movdqa %%xmm3,0x30(%1)\n" "lea 0x40(%1),%1\n" "sub $0x10,%2\n" @@ -98,16 +164,19 
@@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { : "+r"(src_bg24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 - : "r"(kShuffleMaskBG24ToARGB) // %3 - : "memory" + : "m"(kShuffleMaskBG24ToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif ); } void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm7\n" - "movdqa (%3),%%xmm6\n" + "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5\n" + "movdqa %3,%%xmm4\n" "1:" "movdqa (%0),%%xmm0\n" "movdqa 0x10(%0),%%xmm1\n" @@ -115,19 +184,19 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "lea 0x30(%0),%0\n" "movdqa %%xmm3,%%xmm2\n" "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm6,%%xmm2\n" - "por %%xmm7,%%xmm2\n" + "pshufb %%xmm4,%%xmm2\n" + "por %%xmm5,%%xmm2\n" "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm6,%%xmm0\n" + "pshufb %%xmm4,%%xmm0\n" "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm7,%%xmm0\n" - "pshufb %%xmm6,%%xmm1\n" + "por %%xmm5,%%xmm0\n" + "pshufb %%xmm4,%%xmm1\n" "movdqa %%xmm0,(%1)\n" - "por %%xmm7,%%xmm1\n" + "por %%xmm5,%%xmm1\n" "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm6,%%xmm3\n" + "pshufb %%xmm4,%%xmm3\n" "movdqa %%xmm1,0x10(%1)\n" - "por %%xmm7,%%xmm3\n" + "por %%xmm5,%%xmm3\n" "movdqa %%xmm3,0x30(%1)\n" "lea 0x40(%1),%1\n" "sub $0x10,%2\n" @@ -135,147 +204,320 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 - : "r"(kShuffleMaskRAWToARGB) // %3 - : "memory" + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif +); +} + +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + asm volatile( + "movdqa %4,%%xmm5\n" + "movdqa %3,%%xmm4\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm2\n" + "movdqa 0x30(%0),%%xmm3\n" + "pmaddubsw %%xmm4,%%xmm0\n" + "pmaddubsw %%xmm4,%%xmm1\n" + "pmaddubsw %%xmm4,%%xmm2\n" + "pmaddubsw %%xmm4,%%xmm3\n" + "lea 0x40(%0),%0\n" + "phaddw %%xmm1,%%xmm0\n" + "phaddw %%xmm3,%%xmm2\n" + "psrlw $0x7,%%xmm0\n" + "psrlw $0x7,%%xmm2\n" + "packuswb %%xmm2,%%xmm0\n" + "paddb %%xmm5,%%xmm0\n" + "movdqa %%xmm0,(%1)\n" + "lea 0x10(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); } #endif +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile( + "movdqa %5,%%xmm7\n" + "movdqa %6,%%xmm6\n" + "movdqa %7,%%xmm5\n" + "sub %1,%2\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm2\n" + "movdqa 0x30(%0),%%xmm3\n" + "pavgb (%0,%4,1),%%xmm0\n" + "pavgb 0x10(%0,%4,1),%%xmm1\n" + "pavgb 0x20(%0,%4,1),%%xmm2\n" + "pavgb 0x30(%0,%4,1),%%xmm3\n" + "lea 0x40(%0),%0\n" + "movdqa %%xmm0,%%xmm4\n" + "shufps $0x88,%%xmm1,%%xmm0\n" + "shufps $0xdd,%%xmm1,%%xmm4\n" + "pavgb %%xmm4,%%xmm0\n" + "movdqa %%xmm2,%%xmm4\n" + "shufps $0x88,%%xmm3,%%xmm2\n" + "shufps $0xdd,%%xmm3,%%xmm4\n" + "pavgb %%xmm4,%%xmm2\n" + "movdqa %%xmm0,%%xmm1\n" 
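+    // xmm0 and xmm2 now hold the 2x2 box-averaged ARGB pixels; the copies let the U coefficients (xmm7) and V coefficients (xmm6) each be applied below.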
+ "movdqa %%xmm2,%%xmm3\n" + "pmaddubsw %%xmm7,%%xmm0\n" + "pmaddubsw %%xmm7,%%xmm2\n" + "pmaddubsw %%xmm6,%%xmm1\n" + "pmaddubsw %%xmm6,%%xmm3\n" + "phaddw %%xmm2,%%xmm0\n" + "phaddw %%xmm3,%%xmm1\n" + "psraw $0x8,%%xmm0\n" + "psraw $0x8,%%xmm1\n" + "packsswb %%xmm1,%%xmm0\n" + "paddb %%xmm5,%%xmm0\n" + "movlps %%xmm0,(%1)\n" + "movhps %%xmm0,(%1,%2,1)\n" + "lea 0x8(%1),%1\n" + "sub $0x10,%3\n" + "ja 1b\n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast(src_stride_argb)), // %4 + "m"(kARGBToU), // %5 + "m"(kARGBToV), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif +); +} +#endif + +// The following code requires 6 registers and prefers 7 registers. +// 7 registers requires -fpic to be off, and -fomit-frame-pointer +#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 #if defined(__x86_64__) +#define REG_a "rax" +#define REG_d "rdx" +#else +#define REG_a "eax" +#define REG_d "edx" +#endif +#if defined(__APPLE__) || defined(__x86_64__) +#define OMITFP +#else +#define OMITFP __attribute__((optimize("omit-frame-pointer"))) +#endif -// 64 bit linux gcc version +#if defined(__APPLE__) +// REG6 version uses 1 less register but is slower +#define REG6 +#endif -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 - asm volatile( -"1:" - "movzb (%1),%%r10\n" - "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" - "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" - "movzb 0x1(%0),%%r11\n" - "paddsw %%xmm1,%%xmm0\n" - "movq (%5,%%r10,8),%%xmm2\n" - "lea 2(%0),%0\n" - "movq (%5,%%r11,8),%%xmm3\n" - "paddsw %%xmm0,%%xmm2\n" - "paddsw %%xmm0,%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq %%xmm2,0x0(%3)\n" - "lea 8(%3),%3\n" - "sub $0x2,%4\n" +#ifdef REG6 +// 6 register version only has REG_a for temporary +#define CLOBBER "%"REG_a +#define YUVTORGB \ + "1:" \ + "movzb (%1),%%"REG_a"\n" \ + "lea 1(%1),%1\n" \ + "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \ + "movzb (%2),%%"REG_a"\n" \ + "lea 1(%2),%2\n" \ + "movq 4096(%5,%%"REG_a",8),%%xmm1\n" \ + "paddsw %%xmm1,%%xmm0\n" \ + "movzb (%0),%%"REG_a"\n" \ + "movq 0(%5,%%"REG_a",8),%%xmm2\n" \ + "movzb 0x1(%0),%%"REG_a"\n" \ + "movq 0(%5,%%"REG_a",8),%%xmm3\n" \ + "lea 2(%0),%0\n" \ + "paddsw %%xmm0,%%xmm2\n" \ + "paddsw %%xmm0,%%xmm3\n" \ + "shufps $0x44,%%xmm3,%%xmm2\n" \ + "psraw $0x6,%%xmm2\n" \ + "packuswb %%xmm2,%%xmm2\n" \ + "movq %%xmm2,0x0(%3)\n" \ + "lea 8(%3),%3\n" \ + "sub $0x2,%4\n" \ "ja 1b\n" +#else +#define CLOBBER "%"REG_a, "%"REG_d +// This version produces 2 pixels +#define YUVTORGB \ +"1:" \ + "movzb (%1),%%"REG_a"\n" \ + "lea 1(%1),%1\n" \ + "movzb (%2),%%"REG_d"\n" \ + "lea 1(%2),%2\n" \ + "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \ + "movzb 0(%0),%%"REG_a"\n" \ + "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \ + "paddsw %%xmm1,%%xmm0\n" \ + "movzb 1(%0),%%"REG_d"\n" \ + "punpcklqdq %%xmm0,%%xmm0\n" \ + "lea 2(%0),%0\n" \ + "movq 0(%5,%%"REG_a",8),%%xmm1\n" \ + "movhps 0(%5,%%"REG_d",8),%%xmm1\n" \ + "paddsw %%xmm0,%%xmm1\n" \ + "psraw $6,%%xmm1\n" \ + "packuswb %%xmm1,%%xmm1\n" \ + "movq %%xmm1,0(%3)\n" \ + "lea 8(%3),%3\n" \ + "sub $0x2,%4\n" \ + "ja 1b\n" +// This version produces 4 pixels +#define YUVTORGB4 \ +"1:" \ + "movzb 0(%1),%%"REG_a"\n" \ + "movzb 0(%2),%%"REG_d"\n" \ + "movq 
2048(%5,%%"REG_a",8),%%xmm0\n" \ + "movzb 0(%0),%%"REG_a"\n" \ + "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \ + "paddsw %%xmm1,%%xmm0\n" \ + "movzb 1(%0),%%"REG_d"\n" \ + "punpcklqdq %%xmm0,%%xmm0\n" \ + "movq 0(%5,%%"REG_a",8),%%xmm2\n" \ + "movhps 0(%5,%%"REG_d",8),%%xmm2\n" \ + "paddsw %%xmm0,%%xmm2\n" \ + "psraw $6,%%xmm2\n" \ + "movzb 1(%1),%%"REG_a"\n" \ + "movzb 1(%2),%%"REG_d"\n" \ + "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \ + "movzb 2(%0),%%"REG_a"\n" \ + "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \ + "paddsw %%xmm1,%%xmm0\n" \ + "movzb 3(%0),%%"REG_d"\n" \ + "punpcklqdq %%xmm0,%%xmm0\n" \ + "movq 0(%5,%%"REG_a",8),%%xmm3\n" \ + "movhps 0(%5,%%"REG_d",8),%%xmm3\n" \ + "paddsw %%xmm0,%%xmm3\n" \ + "psraw $6,%%xmm3\n" \ + "lea 2(%1),%1\n" \ + "lea 2(%2),%2\n" \ + "lea 4(%0),%0\n" \ + "packuswb %%xmm3,%%xmm2\n" \ + "movdqa %%xmm2,0(%3)\n" \ + "lea 16(%3),%3\n" \ + "sub $0x4,%4\n" \ + "ja 1b\n" +#endif + +// 6 or 7 registers +void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile( + YUVTORGB : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsRgbY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" + "+rm"(width) // %4 + : "r" (kCoefficientsRgbY) // %5 + : "memory", "cc", CLOBBER +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif ); } -void FastConvertYUVToBGRARow(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 +// 6 or 7 registers +void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 asm volatile( -"1:" - "movzb (%1),%%r10\n" - "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" - "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" - "movzb 0x1(%0),%%r11\n" - "paddsw %%xmm1,%%xmm0\n" - "movq (%5,%%r10,8),%%xmm2\n" - "lea 2(%0),%0\n" - "movq (%5,%%r11,8),%%xmm3\n" - "paddsw %%xmm0,%%xmm2\n" - "paddsw %%xmm0,%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq %%xmm2,0x0(%3)\n" - "lea 8(%3),%3\n" - "sub $0x2,%4\n" - "ja 1b\n" + YUVTORGB4 : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsBgraY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" + "+rm"(width) // %4 + : "r" (kCoefficientsRgbY) // %5 + : "memory", "cc", CLOBBER +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif ); } -void FastConvertYUVToABGRRow(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 +void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 asm volatile( -"1:" - "movzb (%1),%%r10\n" - "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" - "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" - "movzb 0x1(%0),%%r11\n" - "paddsw %%xmm1,%%xmm0\n" - "movq (%5,%%r10,8),%%xmm2\n" - "lea 2(%0),%0\n" - "movq (%5,%%r11,8),%%xmm3\n" - "paddsw %%xmm0,%%xmm2\n" - "paddsw %%xmm0,%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq 
%%xmm2,0x0(%3)\n" - "lea 8(%3),%3\n" - "sub $0x2,%4\n" - "ja 1b\n" + YUVTORGB : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsAbgrY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" + "+rm"(width) // %4 + : "r" (kCoefficientsBgraY) // %5 + : "memory", "cc", CLOBBER +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif ); } -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi - const uint8* u_buf, // rsi - const uint8* v_buf, // rdx - uint8* rgb_buf, // rcx - int width) { // r8 +void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 + asm volatile( + YUVTORGB + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r" (kCoefficientsAbgrY) // %5 + : "memory", "cc", CLOBBER +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif +); +} + +// 6 registers +void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi + const uint8* u_buf, // rsi + const uint8* v_buf, // rdx + uint8* rgb_buf, // rcx + int width) { // r8 asm volatile( "1:" - "movzb (%1),%%r10\n" + "movzb (%1),%%"REG_a"\n" "lea 1(%1),%1\n" - "movzb (%2),%%r11\n" + "movq 2048(%5,%%"REG_a",8),%%xmm0\n" + "movzb (%2),%%"REG_a"\n" "lea 1(%2),%2\n" - "movq 2048(%5,%%r10,8),%%xmm0\n" - "movzb (%0),%%r10\n" - "movq 4096(%5,%%r11,8),%%xmm1\n" + "movq 4096(%5,%%"REG_a",8),%%xmm1\n" "paddsw %%xmm1,%%xmm0\n" - "movq (%5,%%r10,8),%%xmm2\n" + "movzb (%0),%%"REG_a"\n" "lea 1(%0),%0\n" + "movq 0(%5,%%"REG_a",8),%%xmm2\n" "paddsw %%xmm0,%%xmm2\n" "shufps $0x44,%%xmm2,%%xmm2\n" "psraw $0x6,%%xmm2\n" @@ -288,23 +530,26 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, // rdi "+r"(u_buf), // %1 "+r"(v_buf), // %2 "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r" (_kCoefficientsRgbY) // %5 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2" + "+rm"(width) // %4 + : "r" (kCoefficientsRgbY) // %5 + : "memory", "cc", "%"REG_a +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif ); } -void FastConvertYToRGB32Row(const uint8* y_buf, // rdi - uint8* rgb_buf, // rcx - int width) { // r8 +// 5 registers +void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi + uint8* rgb_buf, // rcx + int width) { // r8 asm volatile( "1:" - "movzb (%0),%%r10\n" - "movzb 0x1(%0),%%r11\n" - "movq (%3,%%r10,8),%%xmm2\n" + "movzb (%0),%%"REG_a"\n" + "movzb 0x1(%0),%%"REG_d"\n" + "movq (%3,%%"REG_a",8),%%xmm2\n" "lea 2(%0),%0\n" - "movq (%3,%%r11,8),%%xmm3\n" - "shufps $0x44,%%xmm3,%%xmm2\n" + "movhps (%3,%%"REG_d",8),%%xmm2\n" "psraw $0x6,%%xmm2\n" "packuswb %%xmm2,%%xmm2\n" "movq %%xmm2,0x0(%1)\n" @@ -313,154 +558,27 @@ void FastConvertYToRGB32Row(const uint8* y_buf, // rdi "ja 1b\n" : "+r"(y_buf), // %0 "+r"(rgb_buf), // %1 - "+r"(width) // %2 - : "r" (_kCoefficientsRgbY) // %3 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" + "+rm"(width) // %2 + : "r" (kCoefficientsRgbY) // %3 + : "memory", "cc", "%"REG_a, "%"REG_d +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif ); } -#elif defined(__i386__) -// 32 bit gcc version - -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToRGB32Row\n" -"_FastConvertYUVToRGB32Row:\n" -#else - ".global FastConvertYUVToRGB32Row\n" -"FastConvertYUVToRGB32Row:\n" 
#endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" - "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" - "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" - "movq _kCoefficientsRgbY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" - "movq _kCoefficientsRgbY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); +#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX +// 32 bit mmx gcc version -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToBGRARow\n" -"_FastConvertYUVToBGRARow:\n" +#ifdef OSX +#define UNDERSCORE "_" #else - ".global FastConvertYUVToBGRARow\n" -"FastConvertYUVToBGRARow:\n" +#define UNDERSCORE "" #endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" - "movq _kCoefficientsBgraY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" - "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" - "movq _kCoefficientsBgraY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" - "movq _kCoefficientsBgraY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - asm( - ".text\n" -#if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToABGRRow\n" -"_FastConvertYUVToABGRRow:\n" -#else - ".global FastConvertYUVToABGRRow\n" -"FastConvertYUVToABGRRow:\n" -#endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" - -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" - "movq _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" - "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" - "movq _kCoefficientsAbgrY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" - "movq _kCoefficientsAbgrY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movntq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" -); - -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, +void FastConvertYUVToARGBRow_MMX(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -468,11 +586,11 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, asm( ".text\n" #if defined(OSX) || defined(IOS) - ".globl _FastConvertYUV444ToRGB32Row\n" -"_FastConvertYUV444ToRGB32Row:\n" + ".globl _FastConvertYUVToARGBRow_MMX\n" +"_FastConvertYUVToARGBRow_MMX:\n" #else - ".global FastConvertYUV444ToRGB32Row\n" -"FastConvertYUV444ToRGB32Row:\n" + ".global 
FastConvertYUVToARGBRow_MMX\n" +"FastConvertYUVToARGBRow_MMX:\n" #endif "pusha\n" "mov 0x24(%esp),%edx\n" @@ -486,11 +604,149 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, "lea 1(%edi),%edi\n" "movzbl (%esi),%ebx\n" "lea 1(%esi),%esi\n" - "movq _kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n" "movzbl (%edx),%eax\n" - "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n" + "lea 2(%edx),%edx\n" + "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movq %mm1,0x0(%ebp)\n" + "lea 8(%ebp),%ebp\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYUVToBGRARow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUVToBGRARow_MMX\n" +"_FastConvertYUVToBGRARow_MMX:\n" +#else + ".global FastConvertYUVToBGRARow_MMX\n" +"FastConvertYUVToBGRARow_MMX:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n" + "lea 2(%edx),%edx\n" + "movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movq %mm1,0x0(%ebp)\n" + "lea 8(%ebp),%ebp\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYUVToABGRRow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUVToABGRRow_MMX\n" +"_FastConvertYUVToABGRRow_MMX:\n" +#else + ".global FastConvertYUVToABGRRow_MMX\n" +"FastConvertYUVToABGRRow_MMX:\n" +#endif + "pusha\n" + "mov 0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n" + "movzbl 0x1(%edx),%ebx\n" + "movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n" + "lea 2(%edx),%edx\n" + "movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n" + "paddsw %mm0,%mm1\n" + "paddsw %mm0,%mm2\n" + "psraw $0x6,%mm1\n" + "psraw $0x6,%mm2\n" + "packuswb %mm2,%mm1\n" + "movq %mm1,0x0(%ebp)\n" + "lea 8(%ebp),%ebp\n" + "sub $0x2,%ecx\n" + "ja 1b\n" + "popa\n" + "ret\n" +); + +void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + asm( + ".text\n" +#if defined(OSX) || defined(IOS) + ".globl _FastConvertYUV444ToARGBRow_MMX\n" +"_FastConvertYUV444ToARGBRow_MMX:\n" +#else + ".global FastConvertYUV444ToARGBRow_MMX\n" +"FastConvertYUV444ToARGBRow_MMX:\n" +#endif + "pusha\n" + "mov 
0x24(%esp),%edx\n" + "mov 0x28(%esp),%edi\n" + "mov 0x2c(%esp),%esi\n" + "mov 0x30(%esp),%ebp\n" + "mov 0x34(%esp),%ecx\n" + +"1:" + "movzbl (%edi),%eax\n" + "lea 1(%edi),%edi\n" + "movzbl (%esi),%ebx\n" + "lea 1(%esi),%esi\n" + "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n" + "movzbl (%edx),%eax\n" + "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" "lea 1(%edx),%edx\n" - "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n" + "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n" "psraw $0x6,%mm0\n" "packuswb %mm0,%mm0\n" "movd %mm0,0x0(%ebp)\n" @@ -501,17 +757,17 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, "ret\n" ); -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width); +void FastConvertYToARGBRow_MMX(const uint8* y_buf, + uint8* rgb_buf, + int width); asm( ".text\n" #if defined(OSX) || defined(IOS) - ".globl _FastConvertYToRGB32Row\n" -"_FastConvertYToRGB32Row:\n" + ".globl _FastConvertYToARGBRow_MMX\n" +"_FastConvertYToARGBRow_MMX:\n" #else - ".global FastConvertYToRGB32Row\n" -"FastConvertYToRGB32Row:\n" + ".global FastConvertYToARGBRow_MMX\n" +"FastConvertYToARGBRow_MMX:\n" #endif "push %ebx\n" "mov 0x8(%esp),%eax\n" @@ -520,10 +776,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf, "1:" "movzbl (%eax),%ebx\n" - "movq _kCoefficientsRgbY(,%ebx,8),%mm0\n" + "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n" "psraw $0x6,%mm0\n" "movzbl 0x1(%eax),%ebx\n" - "movq _kCoefficientsRgbY(,%ebx,8),%mm1\n" + "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n" "psraw $0x6,%mm1\n" "packuswb %mm1,%mm0\n" "lea 0x2(%eax),%eax\n" @@ -535,125 +791,36 @@ void FastConvertYToRGB32Row(const uint8* y_buf, "ret\n" ); -#else -// C reference code that mimic the YUV assembly. -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ - (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) +#endif -static inline void YuvPixel(uint8 y, - uint8 u, - uint8 v, - uint8* rgb_buf, - int ashift, - int rshift, - int gshift, - int bshift) { - - int b = _kCoefficientsRgbY[256+u][0]; - int g = _kCoefficientsRgbY[256+u][1]; - int r = _kCoefficientsRgbY[256+u][2]; - int a = _kCoefficientsRgbY[256+u][3]; - - b = paddsw(b, _kCoefficientsRgbY[512+v][0]); - g = paddsw(g, _kCoefficientsRgbY[512+v][1]); - r = paddsw(r, _kCoefficientsRgbY[512+v][2]); - a = paddsw(a, _kCoefficientsRgbY[512+v][3]); - - b = paddsw(b, _kCoefficientsRgbY[y][0]); - g = paddsw(g, _kCoefficientsRgbY[y][1]); - r = paddsw(r, _kCoefficientsRgbY[y][2]); - a = paddsw(a, _kCoefficientsRgbY[y][3]); - - b >>= 6; - g >>= 6; - r >>= 6; - a >>= 6; - - *reinterpret_cast(rgb_buf) = (packuswb(b) << bshift) | - (packuswb(g) << gshift) | - (packuswb(r) << rshift) | - (packuswb(a) << ashift); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + ABGRToARGBRow_SSSE3(src_argb, row, pix); + ARGBToYRow_SSSE3(row, dst_y, pix); } -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; x += 2) { - uint8 u = u_buf[x >> 1]; - uint8 v = v_buf[x >> 1]; - uint8 y0 = y_buf[x]; - YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0); - if ((x + 1) < width) { - uint8 y1 = y_buf[x + 1]; - YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0); - } - rgb_buf += 8; // Advance 2 pixels. 
- } +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + BGRAToARGBRow_SSSE3(src_argb, row, pix); + ARGBToYRow_SSSE3(row, dst_y, pix); } -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; x += 2) { - uint8 u = u_buf[x >> 1]; - uint8 v = v_buf[x >> 1]; - uint8 y0 = y_buf[x]; - YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24); - if ((x + 1) < width) { - uint8 y1 = y_buf[x + 1]; - YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24); - } - rgb_buf += 8; // Advance 2 pixels. - } +#ifdef HAS_ARGBTOUVROW_SSSE3 +void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + ABGRToARGBRow_SSSE3(src_argb, row, pix); + ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); } -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; x += 2) { - uint8 u = u_buf[x >> 1]; - uint8 v = v_buf[x >> 1]; - uint8 y0 = y_buf[x]; - YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16); - if ((x + 1) < width) { - uint8 y1 = y_buf[x + 1]; - YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16); - } - rgb_buf += 8; // Advance 2 pixels. - } +void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + BGRAToARGBRow_SSSE3(src_argb, row, pix); + BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); } - -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; ++x) { - uint8 u = u_buf[x]; - uint8 v = v_buf[x]; - uint8 y = y_buf[x]; - YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0); - rgb_buf += 4; // Advance 1 pixel. - } -} - -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; ++x) { - uint8 y = y_buf[x]; - YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0); - rgb_buf += 4; // Advance 1 pixel. - } -} - #endif } // extern "C" diff --git a/source/row_table.cc b/source/row_table.cc index 022d9f88c..d9c21d6dc 100644 --- a/source/row_table.cc +++ b/source/row_table.cc @@ -10,8 +10,6 @@ #include "row.h" -#define kMaxStride (2048 * 4) - extern "C" { #define MAKETABLE(NAME) \ @@ -232,11 +230,7 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\ 0 \ } -#ifdef OSX MAKETABLE(kCoefficientsRgbY) -#else -MAKETABLE(_kCoefficientsRgbY) -#endif #undef RGBY #undef RGBU @@ -264,12 +258,7 @@ MAKETABLE(_kCoefficientsRgbY) 0 \ } -#ifdef OSX MAKETABLE(kCoefficientsBgraY) -#else -MAKETABLE(_kCoefficientsBgraY) -#endif - #undef RGBY #undef RGBU @@ -297,12 +286,39 @@ MAKETABLE(_kCoefficientsBgraY) 0 \ } -#ifdef OSX MAKETABLE(kCoefficientsAbgrY) -#else -MAKETABLE(_kCoefficientsAbgrY) -#endif +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + // To support in-place conversion. 
+ uint8 r = src_abgr[0]; + uint8 g = src_abgr[1]; + uint8 b = src_abgr[2]; + uint8 a = src_abgr[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_abgr += 4; + } +} + +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 a = src_bgra[0]; + uint8 r = src_bgra[1]; + uint8 g = src_bgra[2]; + uint8 b = src_bgra[3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + dst_argb += 4; + src_bgra += 4; + } +} void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { for (int x = 0; x < pix; ++x) { @@ -466,4 +482,133 @@ void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, #endif #endif +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { + // Copy a Y to RGB. + for (int x = 0; x < pix; ++x) { + uint8 y = src_y[0]; + dst_argb[2] = dst_argb[1] = dst_argb[0] = y; + dst_argb[3] = 255u; + dst_argb += 4; + ++src_y; + } +} + +// C reference code that mimic the YUV assembly. +#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) +#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ + (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) + +static inline void YuvPixel(uint8 y, + uint8 u, + uint8 v, + uint8* rgb_buf, + int ashift, + int rshift, + int gshift, + int bshift) { + + int b = kCoefficientsRgbY[256+u][0]; + int g = kCoefficientsRgbY[256+u][1]; + int r = kCoefficientsRgbY[256+u][2]; + int a = kCoefficientsRgbY[256+u][3]; + + b = paddsw(b, kCoefficientsRgbY[512+v][0]); + g = paddsw(g, kCoefficientsRgbY[512+v][1]); + r = paddsw(r, kCoefficientsRgbY[512+v][2]); + a = paddsw(a, kCoefficientsRgbY[512+v][3]); + + b = paddsw(b, kCoefficientsRgbY[y][0]); + g = paddsw(g, kCoefficientsRgbY[y][1]); + r = paddsw(r, kCoefficientsRgbY[y][2]); + a = paddsw(a, kCoefficientsRgbY[y][3]); + + b >>= 6; + g >>= 6; + r >>= 6; + a >>= 6; + + *reinterpret_cast(rgb_buf) = (packuswb(b) << bshift) | + (packuswb(g) << gshift) | + (packuswb(r) << rshift) | + (packuswb(a) << ashift); +} + +void FastConvertYUVToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void FastConvertYUVToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24); + } +} + +void FastConvertYUVToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16); + y_buf += 2; + u_buf += 1; + v_buf += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16); + } +} + +void FastConvertYUV444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0); + y_buf += 1; + u_buf += 1; + v_buf += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +void FastConvertYToARGBRow_C(const uint8* y_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0); + y_buf += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + } // extern "C" diff --git a/source/row_win.cc b/source/row_win.cc index 2bc5fb136..27d2d0b93 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -74,6 +74,160 @@ extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; +// Shuffle table for converting ABGR to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting BGRA to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +__declspec(naked) +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + wloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + ja wloop + ret + } +} + +__declspec(naked) +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_abgr + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm5, _kShuffleMaskABGRToARGB + + convertloop : + movdqa xmm0, [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm5 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + ja convertloop + ret + } +} + +__declspec(naked) +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_bgra + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + movdqa xmm5, _kShuffleMaskBGRAToARGB + + convertloop : + movdqa xmm0, [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm5 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 4 + ja convertloop + ret + } +} + +__declspec(naked) +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_bg24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, _kShuffleMaskBG24ToARGB + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + movdqa [edx + 48], 
xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + movdqa xmm4, _kShuffleMaskRAWToARGB + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm4 + por xmm2, xmm5 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm4 + movdqa [edx + 32], xmm2 + por xmm0, xmm5 + pshufb xmm1, xmm4 + movdqa [edx], xmm0 + por xmm1, xmm5 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm4 + movdqa [edx + 16], xmm1 + por xmm3, xmm5 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret + } +} + // Convert 16 ARGB pixels (64 bytes) to 16 Y values __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { @@ -81,25 +235,25 @@ __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm7, _kARGBToY - movdqa xmm6, _kAddY16 + movdqa xmm5, _kAddY16 + movdqa xmm4, _kARGBToY convertloop : movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - paddb xmm0, xmm6 + paddb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -114,25 +268,25 @@ __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm7, _kBGRAToY - movdqa xmm6, _kAddY16 + movdqa xmm5, _kAddY16 + movdqa xmm4, _kBGRAToY convertloop : movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - paddb xmm0, xmm6 + paddb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -147,25 +301,25 @@ __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm7, _kABGRToY - movdqa xmm6, _kAddY16 + movdqa xmm5, _kAddY16 + movdqa xmm4, _kABGRToY convertloop : movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + 32] movdqa xmm3, [eax + 48] - pmaddubsw xmm0, xmm7 - pmaddubsw xmm1, xmm7 - pmaddubsw xmm2, xmm7 - pmaddubsw xmm3, xmm7 + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 - paddb xmm0, xmm6 + paddb xmm0, xmm5 movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -366,230 +520,138 @@ __asm { } } -__declspec(naked) -void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { -__asm { - mov eax, [esp + 4] // src_bg24 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - movdqa 
xmm6, _kShuffleMaskBG24ToARGB - - convertloop : - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm6 - por xmm2, xmm7 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm6 - movdqa [edx + 32], xmm2 - por xmm0, xmm7 - pshufb xmm1, xmm6 - movdqa [edx], xmm0 - por xmm1, xmm7 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm6 - movdqa [edx + 16], xmm1 - por xmm3, xmm7 - movdqa [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - ja convertloop - ret +#define YUVTORGB(TABLE) __asm { \ + __asm convertloop : \ + __asm movzx eax, byte ptr [edi] \ + __asm lea edi, [edi + 1] \ + __asm movzx ebx, byte ptr [esi] \ + __asm lea esi, [esi + 1] \ + __asm movq mm0, [TABLE + 2048 + 8 * eax] \ + __asm movzx eax, byte ptr [edx] \ + __asm paddsw mm0, [TABLE + 4096 + 8 * ebx] \ + __asm movzx ebx, byte ptr [edx + 1] \ + __asm movq mm1, [TABLE + 8 * eax] \ + __asm lea edx, [edx + 2] \ + __asm movq mm2, [TABLE + 8 * ebx] \ + __asm paddsw mm1, mm0 \ + __asm paddsw mm2, mm0 \ + __asm psraw mm1, 6 \ + __asm psraw mm2, 6 \ + __asm packuswb mm1, mm2 \ + __asm movq [ebp], mm1 \ + __asm lea ebp, [ebp + 8] \ + __asm sub ecx, 2 \ + __asm ja convertloop \ } -} __declspec(naked) -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - movdqa xmm6, _kShuffleMaskRAWToARGB - - convertloop : - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm6 - por xmm2, xmm7 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm6 - movdqa [edx + 32], xmm2 - por xmm0, xmm7 - pshufb xmm1, xmm6 - movdqa [edx], xmm0 - por xmm1, xmm7 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm6 - movdqa [edx + 16], xmm1 - por xmm3, xmm7 - movdqa [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - ja convertloop - ret - } -} - -__declspec(naked) -void FastConvertYUVToRGB32Row(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - __asm { - pushad - mov edx, [esp + 32 + 4] - mov edi, [esp + 32 + 8] - mov esi, [esp + 32 + 12] - mov ebp, [esp + 32 + 16] - mov ecx, [esp + 32 + 20] - - convertloop : - movzx eax, byte ptr [edi] - lea edi, [edi + 1] - movzx ebx, byte ptr [esi] - lea esi, [esi + 1] - movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] - movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] - movzx ebx, byte ptr [edx + 1] - movq mm1, [_kCoefficientsRgbY + 8 * eax] - lea edx, [edx + 2] - movq mm2, [_kCoefficientsRgbY + 8 * ebx] - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - movntq [ebp], mm1 - lea ebp, [ebp + 8] - sub ecx, 2 - ja convertloop - - popad - ret - } -} - -__declspec(naked) -void FastConvertYUVToBGRARow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - __asm { - pushad - mov edx, [esp + 32 + 4] - mov edi, [esp + 32 + 8] - mov esi, [esp + 32 + 12] - mov ebp, [esp + 32 + 16] - mov ecx, [esp + 32 + 20] - - convertloop : - movzx eax, byte ptr [edi] - lea edi, [edi + 1] - movzx ebx, byte ptr [esi] - lea esi, [esi + 1] - movq mm0, 
[_kCoefficientsBgraY + 2048 + 8 * eax] - movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx] - movzx ebx, byte ptr [edx + 1] - movq mm1, [_kCoefficientsBgraY + 8 * eax] - lea edx, [edx + 2] - movq mm2, [_kCoefficientsBgraY + 8 * ebx] - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - movntq [ebp], mm1 - lea ebp, [ebp + 8] - sub ecx, 2 - ja convertloop - - popad - ret - } -} - -__declspec(naked) -void FastConvertYUVToABGRRow(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - __asm { - pushad - mov edx, [esp + 32 + 4] - mov edi, [esp + 32 + 8] - mov esi, [esp + 32 + 12] - mov ebp, [esp + 32 + 16] - mov ecx, [esp + 32 + 20] - - convertloop : - movzx eax, byte ptr [edi] - lea edi, [edi + 1] - movzx ebx, byte ptr [esi] - lea esi, [esi + 1] - movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax] - movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx] - movzx ebx, byte ptr [edx + 1] - movq mm1, [_kCoefficientsAbgrY + 8 * eax] - lea edx, [edx + 2] - movq mm2, [_kCoefficientsAbgrY + 8 * ebx] - paddsw mm1, mm0 - paddsw mm2, mm0 - psraw mm1, 6 - psraw mm2, 6 - packuswb mm1, mm2 - movntq [ebp], mm1 - lea ebp, [ebp + 8] - sub ecx, 2 - ja convertloop - - popad - ret - } -} - -__declspec(naked) -void FastConvertYUV444ToRGB32Row(const uint8* y_buf, +void FastConvertYUVToARGBRow_MMX(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width) { __asm { - pushad - mov edx, [esp + 32 + 4] // Y - mov edi, [esp + 32 + 8] // U - mov esi, [esp + 32 + 12] // V - mov ebp, [esp + 32 + 16] // rgb - mov ecx, [esp + 32 + 20] // width + push ebx + push esi + push edi + push ebp + mov edx, [esp + 16 + 4] + mov edi, [esp + 16 + 8] + mov esi, [esp + 16 + 12] + mov ebp, [esp + 16 + 16] + mov ecx, [esp + 16 + 20] + + YUVTORGB(kCoefficientsRgbY) + + pop ebp + pop edi + pop esi + pop ebx + ret + } +} + +__declspec(naked) +void FastConvertYUVToBGRARow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push ebx + push esi + push edi + push ebp + mov edx, [esp + 16 + 4] + mov edi, [esp + 16 + 8] + mov esi, [esp + 16 + 12] + mov ebp, [esp + 16 + 16] + mov ecx, [esp + 16 + 20] + + YUVTORGB(kCoefficientsBgraY) + + pop ebp + pop edi + pop esi + pop ebx + ret + } +} + +__declspec(naked) +void FastConvertYUVToABGRRow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push ebx + push esi + push edi + push ebp + mov edx, [esp + 16 + 4] + mov edi, [esp + 16 + 8] + mov esi, [esp + 16 + 12] + mov ebp, [esp + 16 + 16] + mov ecx, [esp + 16 + 20] + + YUVTORGB(kCoefficientsAbgrY) + + pop ebp + pop edi + pop esi + pop ebx + ret + } +} + +__declspec(naked) +void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push ebx + push esi + push edi + push ebp + mov edx, [esp + 16 + 4] + mov edi, [esp + 16 + 8] + mov esi, [esp + 16 + 12] + mov ebp, [esp + 16 + 16] + mov ecx, [esp + 16 + 20] convertloop : movzx eax, byte ptr [edi] lea edi, [edi + 1] movzx ebx, byte ptr [esi] lea esi, [esi + 1] - movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax] + movq mm0, [kCoefficientsRgbY + 2048 + 8 * eax] movzx eax, byte ptr [edx] - paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx] + paddsw mm0, [kCoefficientsRgbY + 4096 + 8 * ebx] lea edx, [edx + 1] - paddsw mm0, [_kCoefficientsRgbY + 8 * eax] + paddsw 
mm0, [kCoefficientsRgbY + 8 * eax] psraw mm0, 6 packuswb mm0, mm0 movd [ebp], mm0 @@ -597,15 +659,18 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf, sub ecx, 1 ja convertloop - popad + pop ebp + pop edi + pop esi + pop ebx ret } } __declspec(naked) -void FastConvertYToRGB32Row(const uint8* y_buf, - uint8* rgb_buf, - int width) { +void FastConvertYToARGBRow_MMX(const uint8* y_buf, + uint8* rgb_buf, + int width) { __asm { push ebx mov eax, [esp + 4 + 4] // Y @@ -614,10 +679,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf, convertloop : movzx ebx, byte ptr [eax] - movq mm0, [_kCoefficientsRgbY + 8 * ebx] + movq mm0, [kCoefficientsRgbY + 8 * ebx] psraw mm0, 6 movzx ebx, byte ptr [eax + 1] - movq mm1, [_kCoefficientsRgbY + 8 * ebx] + movq mm1, [kCoefficientsRgbY + 8 * ebx] psraw mm1, 6 packuswb mm0, mm1 lea eax, [eax + 2] diff --git a/source/video_common.h b/source/video_common.h index 9fe08a03a..8d7d13287 100644 --- a/source/video_common.h +++ b/source/video_common.h @@ -42,6 +42,7 @@ enum FourCC { FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), FOURCC_M420 = FOURCC('M', '4', '2', '0'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
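Note on the video_common.h hunk: FOURCC_Q420 is added alongside the existing codes. The FOURCC macro itself is defined earlier in that header and is not part of this hunk; the sketch below shows the little-endian packing it is assumed to perform, so the numeric value of the new constant can be sanity-checked. MakeFourCc is a hypothetical stand-in name, not the library macro.

#include <stdint.h>

// Assumed packing: four ASCII characters stored little-endian in a uint32,
// so the bytes read 'Q','4','2','0' in memory.
constexpr uint32_t MakeFourCc(char a, char b, char c, char d) {
  return static_cast<uint32_t>(static_cast<uint8_t>(a)) |
         (static_cast<uint32_t>(static_cast<uint8_t>(b)) << 8) |
         (static_cast<uint32_t>(static_cast<uint8_t>(c)) << 16) |
         (static_cast<uint32_t>(static_cast<uint8_t>(d)) << 24);
}

static_assert(MakeFourCc('Q', '4', '2', '0') == 0x30323451u,
              "FOURCC_Q420 packs little-endian");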
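For reference while reviewing the FastConvertYUV*Row changes above: the MMX/SSE2 paths and the C fallback all read the same kCoefficientsRgbY table (Y entries at byte offset 0, U at 2048, V at 4096), add the three per-channel contributions, shift out 6 fraction bits and saturate. A minimal scalar sketch of one pixel follows, using <stdint.h> types instead of the library typedefs and plain integer adds where the SIMD code uses saturating 16-bit adds; YuvToArgbPixel is an illustrative name, and linking it requires the table from row_table.cc.

#include <stdint.h>

// Layout from MAKETABLE in row_table.cc: [0..255] = Y term, [256..511] = U
// term, [512..767] = V term, each entry 4 x int16 in B,G,R,A order (8 bytes
// per entry, hence the +2048 / +4096 byte offsets used by the assembly).
extern "C" const int16_t kCoefficientsRgbY[256 * 3][4];

static inline uint8_t Clamp255(int v) {  // scalar stand-in for packuswb
  return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Writes 4 bytes in B,G,R,A memory order (little-endian ARGB), matching the
// 24/16/8/0 shifts used by YuvPixel() for the ARGB variant.
static inline void YuvToArgbPixel(uint8_t y, uint8_t u, uint8_t v,
                                  uint8_t* argb) {
  for (int c = 0; c < 4; ++c) {  // 0=B, 1=G, 2=R, 3=A
    int sum = kCoefficientsRgbY[y][c] +
              kCoefficientsRgbY[256 + u][c] +
              kCoefficientsRgbY[512 + v][c];
    argb[c] = Clamp255(sum >> 6);
  }
}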
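Similarly, ARGBToYRow_SSSE3 (both the row_posix.cc and row_win.cc versions above) forms a weighted per-pixel sum with pmaddubsw/phaddw, shifts right by 7 and adds the kAddY16 bias. A rough scalar equivalent for a single pixel is sketched below; the weight table is passed in rather than redeclared, since its values live outside this section, and the final clamp is a simplification of the packuswb-then-paddb sequence.

#include <stdint.h>

// weights: per-channel multipliers in B,G,R,A order, as stored in the
// kARGBToY constant referenced above (the alpha weight contributes nothing
// in practice).
static inline uint8_t ArgbPixelToY(const uint8_t* argb,
                                   const int8_t weights[4]) {
  int sum = argb[0] * weights[0] +  // B
            argb[1] * weights[1] +  // G
            argb[2] * weights[2] +  // R
            argb[3] * weights[3];   // A
  int y = (sum >> 7) + 16;          // psrlw $7, then paddb kAddY16
  return static_cast<uint8_t>(y < 0 ? 0 : (y > 255 ? 255 : y));
}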