From 9394ed99fcc9802a068ba4a44c36aed79ce87157 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 31 Oct 2011 21:36:47 +0000 Subject: [PATCH] ARGB To I420 and variations using row functions BUG=none TEST=media_unittests from talk used to benchmark Review URL: http://webrtc-codereview.appspot.com/254001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@51 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/cpu_id.h | 3 + source/convert.cc | 405 ++++++++++++++++--------------- source/cpu_id.cc | 7 +- source/format_conversion.cc | 31 ++- source/planar_functions.cc | 321 +++++-------------------- source/rotate.cc | 154 +++++++++++- source/row.h | 102 +++++--- source/row_posix.cc | 121 +++++++--- source/row_table.cc | 165 +++++++++++++ source/row_win.cc | 465 ++++++++++++++++++++++++++++-------- 10 files changed, 1142 insertions(+), 632 deletions(-) diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index 0e1ab48fc..c1000e867 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -20,6 +20,9 @@ static const int kCpuHasSSSE3 = 2; // These flags are only valid on ARM processors static const int kCpuHasNEON = 4; +// Internal flag to indicate cpuid is initialized. +static const int kCpuInitialized = 8; + // Detect CPU has SSE2 etc. bool TestCpuFlag(int flag); diff --git a/source/convert.cc b/source/convert.cc index ee7af0cca..8154dcb78 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -636,185 +636,6 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, return 0; } -// ARGBToI420Row_C etc row functions use the following macro, generating -// code with RGB offsets/strides different for each version. Less error -// prone than duplicating the code. -// template could be used, but macro method works for C and asm and this is -// performance critical code. 
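A note on the fixed-point math these row functions share (the same formulas reappear in the MAKEROWY macro added to row_table.cc): Y is computed in 8.8 fixed point with BT.601 coefficients, rounded, then biased into studio range. A minimal scalar sketch, not part of the patch:

  // 66/256 = 0.258, 129/256 = 0.504, 25/256 = 0.098. The +128 rounds the
  // >> 8, and +16 biases into studio swing: white (255,255,255) -> 235,
  // black (0,0,0) -> 16.
  static inline int RGBToYExample(int r, int g, int b) {
    return ((66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
  }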
- -#define MAKEROWRGBTOI420(NAME,R,G,B,BPP) \ -static void \ -NAME(const uint8* src_row0, const uint8* src_row1, \ - uint8* dst_yplane0, uint8* dst_yplane1, \ - uint8* dst_u, \ - uint8* dst_v, \ - int width) { \ - for (int x = 0; x < width - 1; x += 2) { \ - dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \ - src_row0[G] * 129 + \ - src_row0[B] * 25 + 128) >> 8) + 16; \ - dst_yplane0[1] = (uint8)((src_row0[R + BPP] * 66 + \ - src_row0[G + BPP] * 129 + \ - src_row0[B + BPP] * 25 + 128) >> 8) + 16; \ - dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \ - src_row1[G] * 129 + \ - src_row1[B] * 25 + 128) >> 8) + 16; \ - dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \ - src_row1[G + BPP] * 129 + \ - src_row1[B + BPP] * 25 + 128) >> 8) + 16; \ - dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \ - src_row1[R] + src_row1[R + BPP]) * -38 + \ - (src_row0[G] + src_row0[G + BPP] + \ - src_row1[G] + src_row1[G + BPP]) * -74 + \ - (src_row0[B] + src_row0[B + BPP] + \ - src_row1[B] + src_row1[B + BPP]) * 112 + \ - + 512) >> 10) + 128; \ - dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \ - src_row1[R] + src_row1[R + BPP]) * 112 + \ - (src_row0[G] + src_row0[G + BPP] + \ - src_row1[G] + src_row1[G + BPP]) * -94 + \ - (src_row0[B] + src_row0[B + BPP] + \ - src_row1[B] + src_row1[B + BPP]) * -18 + \ - + 512) >> 10) + 128; \ - dst_yplane0 += 2; \ - dst_yplane1 += 2; \ - ++dst_u; \ - ++dst_v; \ - src_row0 += BPP * 2; \ - src_row1 += BPP * 2; \ - } \ - if (width & 1) { \ - dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \ - src_row0[G] * 129 + \ - src_row0[B] * 25 + 128) >> 8) + 16; \ - dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \ - src_row1[G] * 129 + \ - src_row1[B] * 25 + 128) >> 8) + 16; \ - dst_u[0] = (uint8)(((src_row0[R] + \ - src_row1[R]) * -38 + \ - (src_row0[G] + \ - src_row1[G]) * -74 + \ - (src_row0[B] + \ - src_row1[B]) * 112 + \ - + 256) >> 9) + 128; \ - dst_v[0] = (uint8)(((src_row0[R] + \ - src_row1[R]) * 112 + \ - (src_row0[G] + \ - src_row1[G]) * -94 + \ - (src_row0[B] + \ - src_row1[B]) * -18 + \ - + 256) >> 9) + 128; \ - } \ -} - -// Generate variations of RGBToI420. Parameters are r,g,b offsets within a -// pixel, and number of bytes per pixel. 
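For reference, the byte layouts the parameters encode (offset 0 is the lowest address), as implied by the invocations that follow:

  // Format   bytes in memory   R  G  B  BPP
  // ARGB     B, G, R, A        2  1  0  4
  // BGRA     A, R, G, B        1  2  3  4
  // ABGR     R, G, B, A        0  1  2  4
  // RGB24    B, G, R           2  1  0  3
  // RAW      R, G, B           0  1  2  3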
-MAKEROWRGBTOI420(ARGBToI420Row_C, 2, 1, 0, 4) -MAKEROWRGBTOI420(BGRAToI420Row_C, 1, 2, 3, 4) -MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4) -MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3) -MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3) - -static int RGBToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, - void (*RGBToI420Row)(const uint8* src_row0, - const uint8* src_row1, - uint8* dst_yplane0, - uint8* dst_yplane1, - uint8* dst_u, - uint8* dst_v, - int width)) { - if (src_frame == NULL || dst_y == NULL || - dst_v == NULL || dst_v == NULL) - return -1; - - if (height < 0) { - height = -height; - src_frame = src_frame + src_stride_frame * (height -1); - src_stride_frame = -src_stride_frame; - } - for (int y = 0; y < height - 1; y += 2) { - RGBToI420Row(src_frame, src_frame + src_stride_frame, - dst_y, dst_y + dst_stride_y, - dst_u, dst_v, - width); - src_frame += src_stride_frame * 2; - dst_y += dst_stride_y * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - if (height & 1) { - RGBToI420Row(src_frame, src_frame, - dst_y, dst_y, - dst_u, dst_v, - width); - } - return 0; -} - -int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return RGBToI420(src_frame, src_stride_frame, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, ARGBToI420Row_C); -} - -int BGRAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return RGBToI420(src_frame, src_stride_frame, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, BGRAToI420Row_C); -} - -int ABGRToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return RGBToI420(src_frame, src_stride_frame, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, ABGRToI420Row_C); -} - -int RGB24ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return RGBToI420(src_frame, src_stride_frame, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, RGB24ToI420Row_C); -} - -int RAWToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return RGBToI420(src_frame, src_stride_frame, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, RAWToI420Row_C); -} - int ARGBToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -830,9 +651,9 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_u, uint8* dst_v, int width); #if defined(HAS_ARGBTOYROW_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 8 == 0) && + (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) { + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { ARGBToYRow = ARGBToYRow_SSSE3; } 
else #endif @@ -841,10 +662,10 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, } #if defined(HAS_ARGBTOUVROW_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 8 == 0) && + (width % 16 == 0) && IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && - IS_ALIGNED(dst_u, 4) && (dst_stride_u % 4 == 0) && - IS_ALIGNED(dst_v, 4) && (dst_stride_v % 4 == 0)) { + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } else #endif @@ -853,17 +674,229 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame, } for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); ARGBToYRow(src_frame, dst_y, width); ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); - ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); src_frame += src_stride_frame * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { - ARGBToYRow(src_frame, dst_y, width); ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int BGRAToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_BGRATOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = BGRAToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = BGRAToYRow_C; + } +#if defined(HAS_BGRATOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = BGRAToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = BGRAToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int ABGRToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_ABGRTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + 
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = ABGRToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = ABGRToYRow_C; + } +#if defined(HAS_ABGRTOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = ABGRToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = ABGRToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int RGB24ToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_RGB24TOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = RGB24ToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = RGB24ToYRow_C; + } +#if defined(HAS_RGB24TOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = RGB24ToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = RGB24ToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + } + return 0; +} + +int RAWToI420(const uint8* src_frame, int src_stride_frame, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height) { + if (height < 0) { + height = -height; + src_frame = src_frame + (height - 1) * src_stride_frame; + src_stride_frame = -src_stride_frame; + } + void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); + void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#if defined(HAS_RAWTOYROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { + ARGBToYRow = RAWToYRow_SSSE3; + } else +#endif + { + ARGBToYRow = RAWToYRow_C; + } +#if defined(HAS_RAWTOUVROW_SSSE3) + if 
(libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = RAWToUVRow_SSSE3; + } else +#endif + { + ARGBToUVRow = RAWToUVRow_C; + } + + for (int y = 0; y < (height - 1); y += 2) { + ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); + ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width); + src_frame += src_stride_frame * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { + ARGBToUVRow(src_frame, 0, dst_u, dst_v, width); + ARGBToYRow(src_frame, dst_y, width); } return 0; } diff --git a/source/cpu_id.cc b/source/cpu_id.cc index fc388ba83..cc44e2158 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -15,9 +15,6 @@ #include #endif -// Internal flag to indicate cpuid is initialized. -static const int kCpuInitialized = 16; - // TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) static inline void __cpuid(int cpu_info[4], int info_type) { @@ -64,11 +61,11 @@ static void InitCpuFlags() { void MaskCpuFlags(int enable_flags) { InitCpuFlags(); - cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized; + cpu_info_ &= enable_flags; } bool TestCpuFlag(int flag) { - if (!cpu_info_) { + if (0 == cpu_info_) { InitCpuFlags(); } return cpu_info_ & flag ? true : false; diff --git a/source/format_conversion.cc b/source/format_conversion.cc index db106bd4d..958f44c40 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -14,6 +14,8 @@ #include "video_common.h" #include "row.h" +#define kMaxStride (2048 * 4) + namespace libyuv { // Note: to do this with Neon vld4.8 would load ARGB values into 4 registers @@ -329,6 +331,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { + if (width * 4 > kMaxStride) { + return -1; + } // Negative height means invert the image. 
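(The same inversion idiom recurs in every conversion entry point touched by this patch. Isolated as a sketch, with a hypothetical helper name that does not exist in libyuv:)

  // Point the read pointer at the last row and negate the stride, so the
  // image is traversed bottom-up while the caller treats height as positive.
  static const uint8* FlipReadPointer(const uint8* src, int* stride,
                                      int height) {
    const uint8* bottom = src + (height - 1) * *stride;
    *stride = -*stride;
    return bottom;
  }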
if (height < 0) { height = -height; @@ -347,23 +352,29 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -#define kMaxStride (2048 * 4) SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + #if defined(HAS_ARGBTOYROW_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 8 == 0) && + (width % 16 == 0) && IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) && - IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) { + IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) { ARGBToYRow = ARGBToYRow_SSSE3; -#if defined(HAS_ARGBTOUVROW_SSSE3) - ARGBToUVRow = ARGBToUVRow_SSSE3; -#else - ARGBToUVRow = ARGBToUVRow_C; -#endif } else #endif { ARGBToYRow = ARGBToYRow_C; + } +#if defined(HAS_ARGBTOUVROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) && + IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && + IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; + } else +#endif + { ARGBToUVRow = ARGBToUVRow_C; } @@ -392,9 +403,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, BayerRow0(src_bayer, src_stride_bayer, row, width); BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer, row + kMaxStride, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); - ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); src_bayer += src_stride_bayer * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; @@ -403,8 +414,8 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer, // TODO(fbarchard): Make sure this filters properly if (height & 1) { BayerRow0(src_bayer, src_stride_bayer, row, width); - ARGBToYRow(row, dst_y, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); } return 0; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b7984c086..a7e3e38a6 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -58,16 +58,6 @@ extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; -// Shuffle table for converting BG24 to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; - -// Shuffle table for converting RAW to ARGB. -extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; - #if defined(WIN32) && !defined(COVERAGE_ENABLED) #define HAS_SPLITUV_SSE2 __declspec(naked) @@ -206,7 +196,7 @@ int I420Copy(const uint8* src_y, int src_stride_y, static void SetRow32_NEON(uint8* dst, uint32 v32, int count) { __asm__ volatile ( - "vdup.u32 {q0}, %2 \n" // duplicate 4 ints + "vdup.u32 q0, %2 \n" // duplicate 4 ints "1:\n" "vst1.u32 {q0}, [%0]! 
\n" // store "subs %1, %1, #16 \n" // 16 processed per loop @@ -1282,85 +1272,6 @@ __asm { } } -#define HAS_BG24TOARGBROW_SSSE3 -__declspec(naked) -static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_bg24 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - movdqa xmm6, _kShuffleMaskBG24ToARGB - - convertloop : - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm6 - por xmm2, xmm7 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm6 - movdqa [edx + 32], xmm2 - por xmm0, xmm7 - pshufb xmm1, xmm6 - movdqa [edx], xmm0 - por xmm1, xmm7 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm6 - movdqa [edx + 16], xmm1 - por xmm3, xmm7 - movdqa [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - ja convertloop - ret - } -} - -#define HAS_RAWTOARGBROW_SSSE3 -__declspec(naked) -static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm7, xmm7 // generate mask 0xff000000 - pslld xmm7, 24 - movdqa xmm6, _kShuffleMaskRAWToARGB - - convertloop : - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm3, [eax + 32] - lea eax, [eax + 48] - movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} - pshufb xmm2, xmm6 - por xmm2, xmm7 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} - pshufb xmm0, xmm6 - movdqa [edx + 32], xmm2 - por xmm0, xmm7 - pshufb xmm1, xmm6 - movdqa [edx], xmm0 - por xmm1, xmm7 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} - pshufb xmm3, xmm6 - movdqa [edx + 16], xmm1 - por xmm3, xmm7 - movdqa [edx + 48], xmm3 - lea edx, [edx + 64] - sub ecx, 16 - ja convertloop - ret - } -} #elif (defined(__x86_64__) || defined(__i386__)) && \ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) @@ -1435,84 +1346,6 @@ static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, ); } -#define HAS_BG24TOARGBROW_SSSE3 -static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, - int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm7\n" - "movdqa (%3),%%xmm6\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm3\n" - "lea 0x30(%0),%0\n" - "movdqa %%xmm3,%%xmm2\n" - "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm6,%%xmm2\n" - "por %%xmm7,%%xmm2\n" - "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm6,%%xmm0\n" - "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm7,%%xmm0\n" - "pshufb %%xmm6,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "por %%xmm7,%%xmm1\n" - "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm6,%%xmm3\n" - "movdqa %%xmm1,0x10(%1)\n" - "por %%xmm7,%%xmm3\n" - "movdqa %%xmm3,0x30(%1)\n" - "lea 0x40(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" - : "+r"(src_bg24), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(kShuffleMaskBG24ToARGB) // %3 - : "memory" -); -} - -#define HAS_RAWTOARGBROW_SSSE3 -static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int pix) { - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm7\n" - "movdqa (%3),%%xmm6\n" -"1:" - "movdqa 
(%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm3\n" - "lea 0x30(%0),%0\n" - "movdqa %%xmm3,%%xmm2\n" - "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm6,%%xmm2\n" - "por %%xmm7,%%xmm2\n" - "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm6,%%xmm0\n" - "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm7,%%xmm0\n" - "pshufb %%xmm6,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "por %%xmm7,%%xmm1\n" - "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm6,%%xmm3\n" - "movdqa %%xmm1,0x10(%1)\n" - "por %%xmm7,%%xmm3\n" - "movdqa %%xmm3,0x30(%1)\n" - "lea 0x40(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "r"(kShuffleMaskRAWToARGB) // %3 - : "memory" -); -} - #endif static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) { @@ -1556,97 +1389,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, return 0; } - -static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb += 4; - src_raw += 3; - } -} - -// Convert RAW to ARGB. -int RAWToARGB(const uint8* src_raw, int src_stride_raw, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (height < 0) { - height = -height; - src_raw = src_raw + (height - 1) * src_stride_raw; - src_stride_raw = -src_stride_raw; - } - void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix); -#if defined(HAS_RAWTOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { - RAWToARGBRow = RAWToARGBRow_SSSE3; - } else -#endif - { - RAWToARGBRow = RAWToARGBRow_C; - } - - for (int y = 0; y < height; ++y) { - RAWToARGBRow(src_raw, dst_argb, width); - src_raw += src_stride_raw; - dst_argb += dst_stride_argb; - } - return 0; -} - -static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { - uint8 b = src_bg24[0]; - uint8 g = src_bg24[1]; - uint8 r = src_bg24[2]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = 255u; - dst_argb[3] = 255u; - dst_argb += 4; - src_bg24 += 3; - } -} - -// Convert BG24 to ARGB. -int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (height < 0) { - height = -height; - src_bg24 = src_bg24 + (height - 1) * src_stride_bg24; - src_stride_bg24 = -src_stride_bg24; - } - void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix); -#if defined(HAS_BG24TOARGBROW_SSSE3) - if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && - (width % 16 == 0) && - IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) && - IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { - BG24ToARGBRow = BG24ToARGBRow_SSSE3; - } else -#endif - { - BG24ToARGBRow = BG24ToARGBRow_C; - } - - for (int y = 0; y < height; ++y) { - BG24ToARGBRow(src_bg24, dst_argb, width); - src_bg24 += src_stride_bg24; - dst_argb += dst_stride_argb; - } - return 0; -} - - static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { for (int x = 0; x < pix; ++x) { // To support in-place conversion. 
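The wrappers re-added in the next hunk all pick their row function the same way: use the SSSE3 version only when the CPU flag is set, the width is a multiple of 16, and source and destination are 16-byte aligned with 16-byte-multiple strides; otherwise fall back to C. A standalone sketch of that dispatch using this patch's row functions (the chooser helper itself is hypothetical):

  typedef void (*ToARGBRowFunc)(const uint8* src, uint8* dst_argb, int pix);

  static ToARGBRowFunc ChooseBG24ToARGBRow(const uint8* src, int src_stride,
                                           const uint8* dst, int dst_stride,
                                           int width) {
  #if defined(HAS_BG24TOARGBROW_SSSE3)
    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
        (width % 16 == 0) &&
        IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
        IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
      return BG24ToARGBRow_SSSE3;  // handles 16 pixels per loop iteration
    }
  #endif
    return BG24ToARGBRow_C;  // handles any width and alignment
  }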
@@ -1768,5 +1510,66 @@ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix); return 0; } + +// Convert RAW to ARGB. +int RAWToARGB(const uint8* src_raw, int src_stride_raw, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix); +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } else +#endif + { + RAWToARGBRow = RAWToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, dst_argb, width); + src_raw += src_stride_raw; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert BG24 to ARGB. +int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (height < 0) { + height = -height; + src_bg24 = src_bg24 + (height - 1) * src_stride_bg24; + src_stride_bg24 = -src_stride_bg24; + } + void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix); +#if defined(HAS_BG24TOARGBROW_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) && + IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) { + BG24ToARGBRow = BG24ToARGBRow_SSSE3; + } else +#endif + { + BG24ToARGBRow = BG24ToARGBRow_C; + } + + for (int y = 0; y < height; ++y) { + BG24ToARGBRow(src_bg24, dst_argb, width); + src_bg24 += src_stride_bg24; + dst_argb += dst_stride_argb; + } + return 0; +} + } // namespace libyuv diff --git a/source/rotate.cc b/source/rotate.cc index a1b05e898..43a0072b6 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -497,6 +497,143 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ); #if defined (__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +#define HAS_TRANSPOSE_WX8_FAST_SSSE3 +static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile( +"1:" + // Read in the data from the source pointer. + // First round of bit swap. 
+ "movdqa (%0),%%xmm0\n" + "movdqa (%0,%3),%%xmm1\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm0,%%xmm8\n" + "punpcklbw %%xmm1,%%xmm0\n" + "punpckhbw %%xmm1,%%xmm8\n" + "movdqa (%0),%%xmm2\n" + "movdqa %%xmm0,%%xmm1\n" + "movdqa %%xmm8,%%xmm9\n" + "palignr $0x8,%%xmm1,%%xmm1\n" + "palignr $0x8,%%xmm9,%%xmm9\n" + "movdqa (%0,%3),%%xmm3\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm2,%%xmm10\n" + "punpcklbw %%xmm3,%%xmm2\n" + "punpckhbw %%xmm3,%%xmm10\n" + "movdqa %%xmm2,%%xmm3\n" + "movdqa %%xmm10,%%xmm11\n" + "movdqa (%0),%%xmm4\n" + "palignr $0x8,%%xmm3,%%xmm3\n" + "palignr $0x8,%%xmm11,%%xmm11\n" + "movdqa (%0,%3),%%xmm5\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm4,%%xmm12\n" + "punpcklbw %%xmm5,%%xmm4\n" + "punpckhbw %%xmm5,%%xmm12\n" + "movdqa %%xmm4,%%xmm5\n" + "movdqa %%xmm12,%%xmm13\n" + "movdqa (%0),%%xmm6\n" + "palignr $0x8,%%xmm5,%%xmm5\n" + "palignr $0x8,%%xmm13,%%xmm13\n" + "movdqa (%0,%3),%%xmm7\n" + "lea (%0,%3,2),%0\n" + "movdqa %%xmm6,%%xmm14\n" + "punpcklbw %%xmm7,%%xmm6\n" + "punpckhbw %%xmm7,%%xmm14\n" + "neg %3\n" + "movdqa %%xmm6,%%xmm7\n" + "movdqa %%xmm14,%%xmm15\n" + "lea 0x10(%0,%3,8),%0\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "palignr $0x8,%%xmm15,%%xmm15\n" + "neg %3\n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0\n" + "punpcklwd %%xmm3,%%xmm1\n" + "movdqa %%xmm0,%%xmm2\n" + "movdqa %%xmm1,%%xmm3\n" + "palignr $0x8,%%xmm2,%%xmm2\n" + "palignr $0x8,%%xmm3,%%xmm3\n" + "punpcklwd %%xmm6,%%xmm4\n" + "punpcklwd %%xmm7,%%xmm5\n" + "movdqa %%xmm4,%%xmm6\n" + "movdqa %%xmm5,%%xmm7\n" + "palignr $0x8,%%xmm6,%%xmm6\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "punpcklwd %%xmm10,%%xmm8\n" + "punpcklwd %%xmm11,%%xmm9\n" + "movdqa %%xmm8,%%xmm10\n" + "movdqa %%xmm9,%%xmm11\n" + "palignr $0x8,%%xmm10,%%xmm10\n" + "palignr $0x8,%%xmm11,%%xmm11\n" + "punpcklwd %%xmm14,%%xmm12\n" + "punpcklwd %%xmm15,%%xmm13\n" + "movdqa %%xmm12,%%xmm14\n" + "movdqa %%xmm13,%%xmm15\n" + "palignr $0x8,%%xmm14,%%xmm14\n" + "palignr $0x8,%%xmm15,%%xmm15\n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0\n" + "movq %%xmm0,(%1)\n" + "movdqa %%xmm0,%%xmm4\n" + "palignr $0x8,%%xmm4,%%xmm4\n" + "movq %%xmm4,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm6,%%xmm2\n" + "movdqa %%xmm2,%%xmm6\n" + "movq %%xmm2,(%1)\n" + "palignr $0x8,%%xmm6,%%xmm6\n" + "punpckldq %%xmm5,%%xmm1\n" + "movq %%xmm6,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "movdqa %%xmm1,%%xmm5\n" + "movq %%xmm1,(%1)\n" + "palignr $0x8,%%xmm5,%%xmm5\n" + "movq %%xmm5,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm7,%%xmm3\n" + "movq %%xmm3,(%1)\n" + "movdqa %%xmm3,%%xmm7\n" + "palignr $0x8,%%xmm7,%%xmm7\n" + "movq %%xmm7,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm12,%%xmm8\n" + "movq %%xmm8,(%1)\n" + "movdqa %%xmm8,%%xmm12\n" + "palignr $0x8,%%xmm12,%%xmm12\n" + "movq %%xmm12,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm14,%%xmm10\n" + "movdqa %%xmm10,%%xmm14\n" + "movq %%xmm10,(%1)\n" + "palignr $0x8,%%xmm14,%%xmm14\n" + "punpckldq %%xmm13,%%xmm9\n" + "movq %%xmm14,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "movdqa %%xmm9,%%xmm13\n" + "movq %%xmm9,(%1)\n" + "palignr $0x8,%%xmm13,%%xmm13\n" + "movq %%xmm13,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "punpckldq %%xmm15,%%xmm11\n" + "movq %%xmm11,(%1)\n" + "movdqa %%xmm11,%%xmm15\n" + "palignr $0x8,%%xmm15,%%xmm15\n" + "movq %%xmm15,(%1,%4)\n" + "lea (%1,%4,2),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(static_cast(src_stride)), // %3 + "r"(static_cast(dst_stride)) // %4 + : "memory" +); +} + #define HAS_TRANSPOSE_UVWX8_SSE2 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, @@ -644,17 +781,26 @@ void TransposePlane(const uint8* src, int src_stride, #if defined(HAS_TRANSPOSE_WX8_NEON) if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && (width % 8 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 8 == 0) && - IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) { + IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && + IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { TransposeWx8 = TransposeWx8_NEON; TransposeWxH = TransposeWxH_C; } else #endif +#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) + if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && + (width % 16 == 0) && + IS_ALIGNED(src, 16) && (src_stride % 16 == 0) && + IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { + TransposeWx8 = TransposeWx8_FAST_SSSE3; + TransposeWxH = TransposeWxH_C; + } else +#endif #if defined(HAS_TRANSPOSE_WX8_SSSE3) if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && (width % 8 == 0) && - IS_ALIGNED(src, 16) && (src_stride % 8 == 0) && - IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) { + IS_ALIGNED(src, 8) && (src_stride % 8 == 0) && + IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) { TransposeWx8 = TransposeWx8_SSSE3; TransposeWxH = TransposeWxH_C; } else diff --git a/source/row.h b/source/row.h index 1563e95e3..85343c563 100644 --- a/source/row.h +++ b/source/row.h @@ -13,17 +13,91 @@ #include "libyuv/basic_types.h" +// The following are available on all x86 platforms #if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \ && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) #define HAS_ARGBTOYROW_SSSE3 +#define HAS_BG24TOARGBROW_SSSE3 +#define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 +#define HAS_RGB24TOUVROW_SSSE3 +#define HAS_RAWTOUVROW_SSSE3 #endif +// The following are available only on Windows #if defined(WIN32) \ && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) +#define HAS_BGRATOYROW_SSSE3 +#define 
HAS_ABGRTOYROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 +#define HAS_BGRATOUVROW_SSSE3 +#define HAS_ABGRTOUVROW_SSSE3 #endif extern "C" { +#ifdef HAS_ARGBTOYROW_SSSE3 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) +#define HASRGB24TOYROW_SSSE3 +#endif +#ifdef HASRGB24TOYROW_SSSE3 +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +#endif +void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); +void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); + +#ifdef HAS_BG24TOARGBROW_SSSE3 +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix); +#endif +void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); +void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix); + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var +#else +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#define TALIGN16(t, var) t var __attribute__((aligned(16))) +#endif + +#ifdef OSX +extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); +extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); +extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); +#else +extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); +extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]); +extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]); +#endif void FastConvertYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -52,34 +126,6 @@ void FastConvertYToRGB32Row(const uint8* y_buf, uint8* rgb_buf, int width); -#ifdef HAS_ARGBTOYROW_SSSE3 -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -#endif -void ARGBToYRow_C(const uint8* src_argb, 
uint8* dst_y, int pix); -void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); - - -#if defined(_MSC_VER) -#define SIMD_ALIGNED(var) __declspec(align(16)) var -#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var -#else -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -#define TALIGN16(t, var) t var __attribute__((aligned(16))) -#endif - -#ifdef OSX -extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); -extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); -extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); -#else -extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]); -extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]); -extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]); -#endif - // Method to force C version. //#define USE_MMX 0 //#define USE_SSE2 0 diff --git a/source/row_posix.cc b/source/row_posix.cc index 40e636cc2..88ce475b4 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -23,6 +23,16 @@ extern "C" TALIGN16(const uint8, kAdd16[16]) = { 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u }; +// Shuffle table for converting BG24 to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile( "movdqa (%3),%%xmm7\n" @@ -55,47 +65,81 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } #endif -static inline int RGBToY(uint8 r, uint8 g, uint8 b) { - return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; +#ifdef HAS_BG24TOARGBROW_SSSE3 +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm7\n" + "movdqa (%3),%%xmm6\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm3\n" + "lea 0x30(%0),%0\n" + "movdqa %%xmm3,%%xmm2\n" + "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm6,%%xmm2\n" + "por %%xmm7,%%xmm2\n" + "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm6,%%xmm0\n" + "movdqa %%xmm2,0x20(%1)\n" + "por %%xmm7,%%xmm0\n" + "pshufb %%xmm6,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "por %%xmm7,%%xmm1\n" + "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } + "pshufb %%xmm6,%%xmm3\n" + "movdqa %%xmm1,0x10(%1)\n" + "por %%xmm7,%%xmm3\n" + "movdqa %%xmm3,0x30(%1)\n" + "lea 0x40(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_bg24), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(kShuffleMaskBG24ToARGB) // %3 + : "memory" +); } -static inline int RGBToU(uint8 r, uint8 g, uint8 b) { - return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; -} -static inline int RGBToV(uint8 r, uint8 g, uint8 b) { - return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; -} - -void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { - for (int x = 0; x < width; ++x) { - dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]); - src_argb0 += 4; - dst_y += 1; - } -} - -void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb1 = src_argb0 + src_stride_argb; - for (int x = 0; x < width - 
1; x += 2) { - uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2; - uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2; - uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb0 += 8; - src_argb1 += 8; - dst_u += 1; - dst_v += 1; - } - if (width & 1) { - uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1; - uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1; - uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000 + "pslld $0x18,%%xmm7\n" + "movdqa (%3),%%xmm6\n" +"1:" + "movdqa (%0),%%xmm0\n" + "movdqa 0x10(%0),%%xmm1\n" + "movdqa 0x20(%0),%%xmm3\n" + "lea 0x30(%0),%0\n" + "movdqa %%xmm3,%%xmm2\n" + "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm6,%%xmm2\n" + "por %%xmm7,%%xmm2\n" + "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm6,%%xmm0\n" + "movdqa %%xmm2,0x20(%1)\n" + "por %%xmm7,%%xmm0\n" + "pshufb %%xmm6,%%xmm1\n" + "movdqa %%xmm0,(%1)\n" + "por %%xmm7,%%xmm1\n" + "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } + "pshufb %%xmm6,%%xmm3\n" + "movdqa %%xmm1,0x10(%1)\n" + "por %%xmm7,%%xmm3\n" + "movdqa %%xmm3,0x30(%1)\n" + "lea 0x40(%1),%1\n" + "sub $0x10,%2\n" + "ja 1b\n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(kShuffleMaskRAWToARGB) // %3 + : "memory" +); } +#endif #if defined(__x86_64__) @@ -611,4 +655,5 @@ void FastConvertYToRGB32Row(const uint8* y_buf, } #endif + } // extern "C" diff --git a/source/row_table.cc b/source/row_table.cc index 7ce4a7ebd..022d9f88c 100644 --- a/source/row_table.cc +++ b/source/row_table.cc @@ -10,6 +10,8 @@ #include "row.h" +#define kMaxStride (2048 * 4) + extern "C" { #define MAKETABLE(NAME) \ @@ -301,4 +303,167 @@ MAKETABLE(kCoefficientsAbgrY) MAKETABLE(_kCoefficientsAbgrY) #endif + +void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + uint8 r = src_raw[0]; + uint8 g = src_raw[1]; + uint8 b = src_raw[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb += 4; + src_raw += 3; + } +} + +void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) { + for (int x = 0; x < pix; ++x) { + uint8 b = src_bg24[0]; + uint8 g = src_bg24[1]; + uint8 r = src_bg24[2]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = 255u; + dst_argb[3] = 255u; + dst_argb += 4; + src_bg24 += 3; + } +} + +// C versions do the same +void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + BG24ToARGBRow_C(src_argb, row, pix); + ARGBToYRow_C(row, dst_y, pix); +} + +void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + RAWToARGBRow_C(src_argb, row, pix); + ARGBToYRow_C(row, dst_y, pix); +} + +void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + BG24ToARGBRow_C(src_argb, row, pix); + BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); +} + +void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + 
SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + RAWToARGBRow_C(src_argb, row, pix); + RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); +} + +static inline int RGBToY(uint8 r, uint8 g, uint8 b) { + return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; +} + +static inline int RGBToU(uint8 r, uint8 g, uint8 b) { + return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; +} +static inline int RGBToV(uint8 r, uint8 g, uint8 b) { + return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; +} + +#define MAKEROWY(NAME,R,G,B) \ +void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ + for (int x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += 4; \ + dst_y += 1; \ + } \ +} \ +void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + for (int x = 0; x < width - 1; x += 2) { \ + uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \ + src_rgb1[B] + src_rgb1[B + 4]) >> 2; \ + uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \ + src_rgb1[G] + src_rgb1[G + 4]) >> 2; \ + uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \ + src_rgb1[R] + src_rgb1[R + 4]) >> 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += 8; \ + src_rgb1 += 8; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ +} + +MAKEROWY(ARGB,2,1,0) +MAKEROWY(BGRA,1,2,3) +MAKEROWY(ABGR,0,1,2) + +#if defined(HAS_RAWTOYROW_SSSE3) + +void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + BG24ToARGBRow_SSSE3(src_argb, row, pix); + ARGBToYRow_SSSE3(row, dst_y, pix); +} + +void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + RAWToARGBRow_SSSE3(src_argb, row, pix); + ARGBToYRow_SSSE3(row, dst_y, pix); +} + +#endif + +#if defined(HAS_RAWTOUVROW_SSSE3) +#if defined(HAS_ARGBTOUVROW_SSSE3) +void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + BG24ToARGBRow_SSSE3(src_argb, row, pix); + BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); +} + +void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + RAWToARGBRow_SSSE3(src_argb, row, pix); + RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); +} + +#else + +void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + BG24ToARGBRow_SSSE3(src_argb, row, pix); + BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix); +} + +void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + RAWToARGBRow_SSSE3(src_argb, row, pix); + RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); + ARGBToUVRow_C(row, 
kMaxStride, dst_u, dst_v, pix); +} + +#endif +#endif + } // extern "C" diff --git a/source/row_win.cc b/source/row_win.cc index c90372a14..2bc5fb136 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -16,59 +16,160 @@ extern "C" { #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var // Constant multiplication table for converting ARGB to I400. -extern "C" TALIGN16(const int8, kRGBToY[16]) = { +extern "C" TALIGN16(const int8, kARGBToY[16]) = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; -extern "C" TALIGN16(const int8, kRGBToU[16]) = { +extern "C" TALIGN16(const int8, kARGBToU[16]) = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; -extern "C" TALIGN16(const int8, kRGBToV[16]) = { +extern "C" TALIGN16(const int8, kARGBToV[16]) = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; +// Constants for BGRA +extern "C" TALIGN16(const int8, kBGRAToY[16]) = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +extern "C" TALIGN16(const int8, kBGRAToU[16]) = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +extern "C" TALIGN16(const int8, kBGRAToV[16]) = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +extern "C" TALIGN16(const int8, kABGRToY[16]) = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +extern "C" TALIGN16(const int8, kABGRToU[16]) = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +extern "C" TALIGN16(const int8, kABGRToV[16]) = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + extern "C" TALIGN16(const uint8, kAddY16[16]) = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, }; extern "C" TALIGN16(const uint8, kAddUV128[16]) = { - 128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u, - 128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; +// Shuffle table for converting BG24 to ARGB. +extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u +}; + +// Shuffle table for converting RAW to ARGB. 
+extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u +}; + +// Convert 16 ARGB pixels (64 bytes) to 16 Y values __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // pix - movdqa xmm7, _kRGBToY + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm7, _kARGBToY movdqa xmm6, _kAddY16 - pcmpeqb xmm5, xmm5 // Generate mask 0x0000ffff - psrld xmm5, 16 convertloop : - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - pmaddubsw xmm0, xmm7 - lea eax, [eax + 32] - pmaddubsw xmm1, xmm7 // BG ra BG ra BG ra BG ra - palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx - paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx - pand xmm2, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 - palignr xmm3, xmm1, 2 - paddw xmm3, xmm1 - pand xmm3, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 - packssdw xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA - psrlw xmm2, 7 // 0B xx 0B xx 0B xx 0B xx - packuswb xmm2, xmm2 - paddb xmm2, xmm6 - movq qword ptr [edx], xmm2 - lea edx, [edx + 8] - sub ecx, 8 - ja convertloop + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm6 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm7, _kBGRAToY + movdqa xmm6, _kAddY16 + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm6 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm7, _kABGRToY + movdqa xmm6, _kAddY16 + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm7 + pmaddubsw xmm1, xmm7 + pmaddubsw xmm2, xmm7 + pmaddubsw xmm3, xmm7 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm6 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + ja convertloop ret } } @@ -84,55 +185,52 @@ __asm { mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - movdqa xmm7, _kRGBToU - movdqa xmm6, _kRGBToV + movdqa xmm7, _kARGBToU + movdqa xmm6, _kARGBToV movdqa xmm5, _kAddUV128 - pcmpeqb xmm4, xmm4 // Generate mask 0x0000ffff - psrld xmm4, 16 + sub edi, edx // stride from u to v convertloop : - // step 1 - subsample 8x2 argb pixels to 4x1 - movdqa xmm0, [eax] // 32x2 -> 32x1 + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] 
movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // 32x1 -> 16x1 + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 shufps xmm0, xmm1, 0x88 - shufps xmm2, xmm1, 0xdd - pavgb xmm0, xmm2 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 // step 2 - convert to U and V // from here down is very similar to Y code except - // instead of 8 different pixels, its 4 pixels of U and 4 of V + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm7 // U - pmaddubsw xmm1, xmm6 // V + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned - palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx - paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx - pand xmm2, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 - - palignr xmm3, xmm1, 2 - paddw xmm3, xmm1 - pand xmm3, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00 - - psraw xmm2, 8 - psraw xmm3, 8 - packsswb xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA - paddb xmm2, xmm5 // -> unsigned - packuswb xmm2, xmm2 // 8 bytes. 4 U, 4 V - - // step 3 - store 4 U and 4 V values - movd dword ptr [edx], xmm2 // U - lea edx, [edx + 4] - pshufd xmm0, xmm2, 0x55 // V - movd dword ptr [edi], xmm0 - lea edi, [edi + 4] - sub ecx, 8 + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 ja convertloop pop edi pop esi @@ -140,45 +238,208 @@ __asm { } } -static inline int RGBToY(uint8 r, uint8 g, uint8 b) { - return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16; -} +__declspec(naked) +void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, _kBGRAToU + movdqa xmm6, _kBGRAToV + movdqa xmm5, _kAddUV128 + sub edi, edx // stride from u to v -static inline int RGBToU(uint8 r, uint8 g, uint8 b) { - return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128; -} -static inline int RGBToV(uint8 r, uint8 g, uint8 b) { - return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128; -} + convertloop : + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 -void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { - for (int x = 0; x < width; ++x) { - dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]); - src_argb0 += 4; - dst_y += 1; + // step 2 - convert to U and V + // from here down is very similar to Y code 
except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + pop edi + pop esi + ret } } -void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb1 = src_argb0 + src_stride_argb; - for (int x = 0; x < width - 1; x += 2) { - uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2; - uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2; - uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb0 += 8; - src_argb1 += 8; - dst_u += 1; - dst_v += 1; +__declspec(naked) +void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, _kABGRToU + movdqa xmm6, _kABGRToV + movdqa xmm5, _kAddUV128 + sub edi, edx // stride from u to v + + convertloop : + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + sub ecx, 16 + ja convertloop + pop edi + pop esi + ret } - if (width & 1) { - uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1; - uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1; - uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); +} + +__declspec(naked) +void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { +__asm { + mov eax, [esp + 4] // src_bg24 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0xff000000 + pslld xmm7, 24 + movdqa xmm6, _kShuffleMaskBG24ToARGB + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm6 + por xmm2, xmm7 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm6 + movdqa [edx 
+ 32], xmm2 + por xmm0, xmm7 + pshufb xmm1, xmm6 + movdqa [edx], xmm0 + por xmm1, xmm7 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm6 + movdqa [edx + 16], xmm1 + por xmm3, xmm7 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret + } +} + +__declspec(naked) +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm7, xmm7 // generate mask 0xff000000 + pslld xmm7, 24 + movdqa xmm6, _kShuffleMaskRAWToARGB + + convertloop : + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm3, [eax + 32] + lea eax, [eax + 48] + movdqa xmm2, xmm3 + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + pshufb xmm2, xmm6 + por xmm2, xmm7 + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + pshufb xmm0, xmm6 + movdqa [edx + 32], xmm2 + por xmm0, xmm7 + pshufb xmm1, xmm6 + movdqa [edx], xmm0 + por xmm1, xmm7 + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + pshufb xmm3, xmm6 + movdqa [edx + 16], xmm1 + por xmm3, xmm7 + movdqa [edx + 48], xmm3 + lea edx, [edx + 64] + sub ecx, 16 + ja convertloop + ret } }
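A closing note on the widened SSSE3 Y rows: pmaddubsw multiplies the unsigned pixel bytes by the signed coefficient bytes and sums adjacent pairs, phaddw completes each pixel's sum, and the shift is 7 rather than the C code's 8 because the table holds roughly halved coefficients that fit in signed 8 bits ({13, 65, 33} versus the C path's {25, 129, 66}). The same arithmetic in scalar form, as a sketch rather than code from the patch:

  static inline uint8 ARGBToY_SimdEquivalent(uint8 b, uint8 g, uint8 r) {
    // Mirrors pmaddubsw/phaddw with kARGBToY = {13, 65, 33, 0}, then
    // psrlw 7 and paddb 16. No rounding term is added, so results can
    // differ from the C version by a code value or two.
    return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
  }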