From 1096543eaa1e596a93ba5d3863e637dc489e32cc Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 8 Mar 2013 23:22:32 +0000 Subject: [PATCH] ARGBShuffle AVX2 BUG=196 TESTED=BGRAToARGB* Review URL: https://webrtc-codereview.appspot.com/1171006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@596 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 7 + include/libyuv/row.h | 43 +++--- include/libyuv/version.h | 2 +- source/convert_argb.cc | 122 ++++------------- source/convert_from_argb.cc | 40 ++---- source/planar_functions.cc | 65 +++++++++ source/row_any.cc | 25 ++++ source/row_common.cc | 88 ++++--------- source/row_neon.cc | 104 ++++----------- source/row_posix.cc | 208 +++++++++++------------------ source/row_win.cc | 212 ++++++++++++------------------ 12 files changed, 366 insertions(+), 552 deletions(-) diff --git a/README.chromium b/README.chromium index 9fc6df6c6..67c782b57 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 595 +Version: 596 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index f74bf0813..73625b1cd 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -340,6 +340,13 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #define HAS_ARGBAFFINEROW_SSE2 #endif // LIBYUV_DISABLE_X86 +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +// shuffler is 16 bytes and must be aligned. 
+LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5bf7b6481..25017a3a3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -35,8 +35,6 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) // Conversions. -#define HAS_ABGRTOARGBROW_SSSE3 -#define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 @@ -47,12 +45,10 @@ extern "C" { #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 -#define HAS_ARGBTORGBAROW_SSSE3 #define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_BGRATOARGBROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_SSE2 @@ -84,7 +80,6 @@ extern "C" { #define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 -#define HAS_RGBATOARGBROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 #define HAS_SETROW_X86 @@ -98,6 +93,7 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSSE3 // Effects #define HAS_ARGBADDROW_SSE2 @@ -140,6 +136,7 @@ extern "C" { #define HAS_HALFROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 // Effects #define HAS_ARGBATTENUATEROW_AVX2 @@ -177,7 +174,6 @@ extern "C" { // The following are available on Neon platforms #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_ABGRTOARGBROW_NEON #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYROW_NEON #define 
HAS_ARGB1555TOARGBROW_NEON @@ -192,13 +188,11 @@ extern "C" { #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTORGBAROW_NEON #define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV422ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYROW_NEON -#define HAS_BGRATOARGBROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON @@ -233,7 +227,6 @@ extern "C" { #define HAS_RGB565TOARGBROW_NEON #define HAS_RGB565TOUVROW_NEON #define HAS_RGB565TOYROW_NEON -#define HAS_RGBATOARGBROW_NEON #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON @@ -633,11 +626,24 @@ void SetRow_C(uint8* dst, uint32 v32, int count); void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, int height); -void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); -void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); -void ABGRToARGBRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_argb, - int pix); -void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix); +// ARGBShufflers for BGRAToARGB etc. 
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); + void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); @@ -645,9 +651,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix); -void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix); -void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix); -void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix); + void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix); @@ -655,9 +659,6 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, int pix); -void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); -void ABGRToARGBRow_C(const uint8* src_abgr, 
uint8* dst_argb, int pix); -void RGBAToARGBRow_C(const uint8* src_rgba, uint8* dst_argb, int pix); void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix); void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); @@ -680,14 +681,12 @@ void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, int pix); -void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 710679854..94bba7bf6 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 595 +#define LIBYUV_VERSION 596 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 576c136bc..6f799b582 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -15,6 +15,7 @@ #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif +#include "libyuv/planar_functions.h" #include "libyuv/rotate_argb.h" #include "libyuv/row.h" #include "libyuv/video_common.h" @@ -295,41 +296,30 @@ int I400ToARGB(const 
uint8* src_y, int src_stride_y, return 0; } +// Shuffle table for converting BGRA to ARGB. +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u +}; + +// Shuffle table for converting ABGR to ARGB. +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + +// Shuffle table for converting RGBA to ARGB. +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u +}; + // Convert BGRA to ARGB. LIBYUV_API int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, uint8* dst_argb, int dst_stride_argb, int width, int height) { - if (!src_bgra || !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_bgra = src_bgra + (height - 1) * src_stride_bgra; - src_stride_bgra = -src_stride_bgra; - } - void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix) = - BGRAToARGBRow_C; -#if defined(HAS_BGRATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - BGRAToARGBRow = BGRAToARGBRow_SSSE3; - } -#elif defined(HAS_BGRATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - BGRAToARGBRow = BGRAToARGBRow_NEON; - } -#endif - - for (int y = 0; y < height; ++y) { - BGRAToARGBRow(src_bgra, dst_argb, width); - src_bgra += src_stride_bgra; - dst_argb += dst_stride_argb; - } - return 0; + return ARGBShuffle(src_bgra, src_stride_bgra, + dst_argb, dst_stride_argb, + reinterpret_cast<const uint8*>(&kShuffleMaskBGRAToARGB), + width, height); } // Convert ABGR to ARGB. 
@@ -337,40 +327,10 @@ LIBYUV_API int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, uint8* dst_argb, int dst_stride_argb, int width, int height) { - if (!src_abgr || !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_abgr = src_abgr + (height - 1) * src_stride_abgr; - src_stride_abgr = -src_stride_abgr; - } - void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix) = - ABGRToARGBRow_C; -#if defined(HAS_ABGRTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { - // TODO(fbarchard): Port to posix. -#if defined(_M_IX86) - ABGRToARGBRow = ABGRToARGBRow_Unaligned_SSSE3; -#endif - if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) - ABGRToARGBRow = ABGRToARGBRow_SSSE3; - } -#elif defined(HAS_ABGRTOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ABGRToARGBRow = ABGRToARGBRow_NEON; - } -#endif - - for (int y = 0; y < height; ++y) { - ABGRToARGBRow(src_abgr, dst_argb, width); - src_abgr += src_stride_abgr; - dst_argb += dst_stride_argb; - } - return 0; + return ARGBShuffle(src_abgr, src_stride_abgr, + dst_argb, dst_stride_argb, + reinterpret_cast<const uint8*>(&kShuffleMaskABGRToARGB), + width, height); } // Convert RGBA to ARGB. @@ -378,36 +338,10 @@ LIBYUV_API int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, uint8* dst_argb, int dst_stride_argb, int width, int height) { - if (!src_rgba || !dst_argb || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_rgba = src_rgba + (height - 1) * src_stride_rgba; - src_stride_rgba = -src_stride_rgba; - } - void (*RGBAToARGBRow)(const uint8* src_rgba, uint8* dst_argb, int pix) = - RGBAToARGBRow_C; -#if defined(HAS_RGBATOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - RGBAToARGBRow = RGBAToARGBRow_SSSE3; - } -#elif defined(HAS_RGBATOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - RGBAToARGBRow = RGBAToARGBRow_NEON; - } -#endif - - for (int y = 0; y < height; ++y) { - RGBAToARGBRow(src_rgba, dst_argb, width); - src_rgba += src_stride_rgba; - dst_argb += dst_stride_argb; - } - return 0; + return ARGBShuffle(src_rgba, src_stride_rgba, + dst_argb, dst_stride_argb, + reinterpret_cast<const uint8*>(&kShuffleMaskRGBAToARGB), + width, height); } // Convert RGB24 to ARGB. diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 2563728d4..228876272 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -667,42 +667,20 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, return 0; } +// Shuffle table for converting ARGB to RGBA. +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u +}; + // Convert ARGB to RGBA. LIBYUV_API int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, uint8* dst_rgba, int dst_stride_rgba, int width, int height) { - if (!src_argb || !dst_rgba || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. 
- if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - void (*ARGBToRGBARow)(const uint8* src_argb, uint8* dst_rgba, int pix) = - ARGBToRGBARow_C; -#if defined(HAS_ARGBTORGBAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 4) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { - ARGBToRGBARow = ARGBToRGBARow_SSSE3; - } -#elif defined(HAS_ARGBTORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - ARGBToRGBARow = ARGBToRGBARow_NEON; - } -#endif - - for (int y = 0; y < height; ++y) { - ARGBToRGBARow(src_argb, dst_rgba, width); - src_argb += src_stride_argb; - dst_rgba += dst_stride_rgba; - } - return 0; + return ARGBShuffle(src_argb, src_stride_argb, + dst_rgba, dst_stride_rgba, + reinterpret_cast<const uint8*>(&kShuffleMaskARGBToRGBA), + width, height); } // Convert ARGB To RGB24. diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 0b5f7823b..ff4b38632 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1603,6 +1603,71 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, return 0; } +// Shuffle ARGB channel order. e.g. BGRA to ARGB. +LIBYUV_API +int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, + uint8* dst_argb, int dst_stride_argb, + const uint8* shuffler, int width, int height) { + if (!src_bgra || !dst_argb || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_bgra = src_bgra + (height - 1) * src_stride_bgra; + src_stride_bgra = -src_stride_bgra; + } + // Coalesce contiguous rows. 
+ if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { + return ARGBShuffle(src_bgra, 0, dst_argb, 0, shuffler, width * height, 1); + } + void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, + const uint8* shuffler, int pix) = ARGBShuffleRow_C; +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBShuffleRow = ARGBShuffleRow_SSSE3; + } + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + bool clear = false; + if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { + clear = true; + ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGBShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif + + for (int y = 0; y < height; ++y) { + ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); + src_bgra += src_stride_bgra; + dst_argb += dst_stride_argb; + } +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (clear) { + __asm vzeroupper; + } +#endif + return 0; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_any.cc b/source/row_any.cc index 723a56652..a1f90f06a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -266,6 +266,7 @@ YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C, YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, 4, 4, 7) #endif +#undef YANY // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. 
#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \ @@ -444,6 +445,30 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, #endif #undef MATHROW_ANY +// Shuffle may want to work in place, so last16 method can not be used. +#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_argb, \ + const uint8* shuffler, int width) { \ + int n = width & ~MASK; \ + ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \ + ARGBTOY_C(src_argb + n * SBPP, \ + dst_argb + n * BPP, shuffler, width & MASK); \ + } + +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, + ARGBShuffleRow_C, 4, 4, 7) +#endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, + ARGBShuffleRow_C, 4, 4, 15) +#endif +#ifdef HAS_ARGBSHUFFLEROW_NEON +YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, + ARGBShuffleRow_C, 4, 4, 3) +#endif +#undef YANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 557d0ffaf..8490846dd 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -30,54 +30,6 @@ static inline void WRITEWORD(uint8* p, uint32 v) { } #endif -void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) { - for (int x = 0; x < width; ++x) { - // To support in-place conversion. - uint8 a = src_bgra[0]; - uint8 r = src_bgra[1]; - uint8 g = src_bgra[2]; - uint8 b = src_bgra[3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - dst_argb += 4; - src_bgra += 4; - } -} - -void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) { - for (int x = 0; x < width; ++x) { - // To support in-place conversion. 
- uint8 r = src_abgr[0]; - uint8 g = src_abgr[1]; - uint8 b = src_abgr[2]; - uint8 a = src_abgr[3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - dst_argb += 4; - src_abgr += 4; - } -} - -void RGBAToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) { - for (int x = 0; x < width; ++x) { - // To support in-place conversion. - uint8 a = src_abgr[0]; - uint8 b = src_abgr[1]; - uint8 g = src_abgr[2]; - uint8 r = src_abgr[3]; - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - dst_argb += 4; - src_abgr += 4; - } -} - void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { for (int x = 0; x < width; ++x) { uint8 b = src_rgb24[0]; @@ -152,21 +104,6 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, } } -void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width) { - for (int x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; - uint8 a = src_argb[3]; - dst_rgb[0] = a; - dst_rgb[1] = b; - dst_rgb[2] = g; - dst_rgb[3] = r; - dst_rgb += 4; - src_argb += 4; - } -} - void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { for (int x = 0; x < width; ++x) { uint8 b = src_argb[0]; @@ -1569,7 +1506,7 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { uint32 g = src_argb[1]; uint32 r = src_argb[2]; const uint32 a = src_argb[3]; - const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.16 fixed point + const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point b = (b * ia) >> 8; g = (g * ia) >> 8; r = (r * ia) >> 8; @@ -1689,6 +1626,29 @@ void ARGBToBayerRow_C(const uint8* src_argb, } } +// Use first 4 shuffler values to reorder ARGB channels. 
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + int index0 = shuffler[0]; + int index1 = shuffler[1]; + int index2 = shuffler[2]; + int index3 = shuffler[3]; + // Shuffle a row of ARGB. + for (int x = 0; x < pix; ++x) { + // To support in-place conversion. + uint8 b = src_argb[index0]; + uint8 g = src_argb[index1]; + uint8 r = src_argb[index2]; + uint8 a = src_argb[index3]; + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/source/row_neon.cc b/source/row_neon.cc index 6836f3dff..086630b4d 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -860,58 +860,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ); } -void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) { - asm volatile ( - ".p2align 2 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d2 \n" // swap G, R - "vswp.u8 d0, d3 \n" // swap B, A - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) { - asm volatile ( - ".p2align 2 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d0, d2 \n" // swap R, B - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) { - asm volatile ( - ".p2align 2 \n" - "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmov.u8 d4, d0 \n" // move A after RGB - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4" // Clobber List - ); -} - void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha @@ -1052,23 +1000,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, ); } -void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) { - asm volatile ( - ".p2align 2 \n" - "1: \n" - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmov.u8 d0, d4 \n" // move A before RGB. - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of RGBA. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgba), // %1 - "+r"(pix) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4" // Clobber List - ); -} - void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( ".p2align 2 \n" @@ -1242,22 +1173,41 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, } // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG -void ARGBToBayerRow_NEON(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix) { +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { asm volatile ( - "vmov.u32 d2[0], %2 \n" // selector + "vmov.u32 d2[0], %3 \n" // selector "1: \n" "vld1.u8 {q0}, [%0]! \n" // load row 4 pixels. 
- "subs %3, %3, #4 \n" // 4 processed per loop + "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d3, {d0, d1}, d2 \n" // look up 4 pixels "vst1.u32 {d3[0]}, [%1]! \n" // store 4. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 - "+r"(selector), // %2 - "+r"(pix) // %3 - : - : "cc", "memory", "q0", "q1" // Clobber List + "+r"(pix) // %2 + : "r"(selector) // %3 (read-only input; no trailing comma allowed before the clobber section) + : "cc", "memory", "q0", "q1" // Clobber List: q1 = d2 (selector) + d3 (vtbl result) + ); +} + +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vld1.u8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.u8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } diff --git a/source/row_posix.cc b/source/row_posix.cc index f35e85204..32b62b933 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -101,26 +101,6 @@ CONST uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; -// Shuffle table for converting ABGR to ARGB. -CONST uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; - -// Shuffle table for converting BGRA to ARGB. -CONST uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; - -// Shuffle table for converting RGBA to ARGB. -CONST uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u -}; - -// Shuffle table for converting ARGB to RGBA. 
-CONST uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u -}; - // Shuffle table for converting ARGB to RGB24. CONST uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u @@ -202,101 +182,6 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, ); } -void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { - asm volatile ( - "movdqa %3,%%xmm5 \n" - "sub %0,%1 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" - "jg 1b \n" - - : "+r"(src_abgr), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "m"(kShuffleMaskABGRToARGB) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif - ); -} - -void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { - asm volatile ( - "movdqa %3,%%xmm5 \n" - "sub %0,%1 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "m"(kShuffleMaskBGRAToARGB) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif - ); -} - -void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) { - asm volatile ( - "movdqa %3,%%xmm5 \n" - "sub %0,%1 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" - "jg 1b \n" - - : "+r"(src_rgba), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : "m"(kShuffleMaskRGBAToARGB) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif - ); -} - -void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) { - asm volatile ( - "movdqa %3,%%xmm5 \n" - "sub %0,%1 \n" - ".p2align 4 \n" - 
"1: \n" - "movdqa (%0),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" - "jg 1b \n" - - : "+r"(src_argb), // %0 - "+r"(dst_rgba), // %1 - "+r"(pix) // %2 - : "m"(kShuffleMaskARGBToRGBA) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif - ); -} - void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 @@ -4684,15 +4569,15 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { asm volatile ( - "sub %0,%1 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "pavgb (%0,%3),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%0,%1) \n" - "lea 0x10(%0),%0 \n" - "jg 1b \n" + "sub %0,%1 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pavgb (%0,%3),%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%0,%1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(pix) // %2 @@ -4707,17 +4592,17 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( - "movd %3,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movd %3,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" ".p2align 4 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "sub $0x4,%2 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "jg 1b \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 @@ -4729,12 +4614,67 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ); } +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqa (%3),%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "movdqa (%3),%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_frame, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %1,%2 \n" ".p2align 4 \n" "1: \n" "movq (%1),%%xmm2 \n" diff --git a/source/row_win.cc b/source/row_win.cc index 902deebe2..3bfad4c85 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -132,26 +132,6 @@ static const uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; -// Shuffle table for converting BGRA to ARGB. -static const uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; - -// Shuffle table for converting ABGR to ARGB. 
-static const uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; - -// Shuffle table for converting RGBA to ARGB. -static const uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u -}; - -// Shuffle table for converting ARGB to RGBA. -static const uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u -}; - // Shuffle table for converting ARGB to RGB24. static const uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u @@ -230,112 +210,6 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, } } -__declspec(naked) __declspec(align(16)) -void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { - __asm { - mov eax, [esp + 4] // src_bgra - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm5, kShuffleMaskBGRAToARGB - sub edx, eax - - align 16 - convertloop: - movdqa xmm0, [eax] - pshufb xmm0, xmm5 - sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ABGRToARGBRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_abgr - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm5, kShuffleMaskABGRToARGB - sub edx, eax - - align 16 - convertloop: - movdqu xmm0, [eax] - pshufb xmm0, xmm5 - sub ecx, 4 - movdqu [eax + edx], xmm0 - lea eax, [eax + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { - __asm { - mov eax, [esp + 4] // src_abgr - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm5, kShuffleMaskABGRToARGB - sub edx, eax - - align 16 - convertloop: - movdqa xmm0, [eax] - pshufb xmm0, xmm5 - sub ecx, 4 - movdqa [eax + 
edx], xmm0 - lea eax, [eax + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) { - __asm { - mov eax, [esp + 4] // src_rgba - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - movdqa xmm5, kShuffleMaskRGBAToARGB - sub edx, eax - - align 16 - convertloop: - movdqa xmm0, [eax] - pshufb xmm0, xmm5 - sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] - jg convertloop - ret - } -} - -__declspec(naked) __declspec(align(16)) -void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgba - mov ecx, [esp + 12] // pix - movdqa xmm5, kShuffleMaskARGBToRGBA - sub edx, eax - - align 16 - convertloop: - movdqa xmm0, [eax] - pshufb xmm0, xmm5 - sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] - jg convertloop - ret - } -} - __declspec(naked) __declspec(align(16)) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { __asm { @@ -5635,8 +5509,8 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, #endif // HAS_HALFROW_AVX2 __declspec(naked) __declspec(align(16)) -void ARGBToBayerRow_SSSE3(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix) { +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_bayer @@ -5657,6 +5531,88 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, } } +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    movdqa     xmm5, [ecx]       // 16 byte shuffle mask; aligned load.
+    mov        ecx, [esp + 16]   // pix
+
+    align      16
+  wloop:
+    movdqa     xmm0, [eax]       // 32 bytes per loop, aligned loads.
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5        // reorder bytes as selected by xmm5.
+    pshufb     xmm1, xmm5
+    sub        ecx, 8            // flags for jg; stores and lea keep them.
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                    const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    movdqa     xmm5, [ecx]       // shuffle mask is still an aligned load.
+    mov        ecx, [esp + 16]   // pix
+
+    align      16
+  wloop:
+    movdqu     xmm0, [eax]       // 32 bytes per loop, unaligned loads.
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5        // reorder bytes as selected by xmm5.
+    pshufb     xmm1, xmm5
+    sub        ecx, 8            // flags for jg; stores and lea keep them.
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    vbroadcastf128 ymm5, [ecx]   // same shuffle in high as low.
+    mov        ecx, [esp + 16]   // pix
+
+    align      16
+  wloop:
+    vmovdqu    ymm0, [eax]       // 64 bytes per loop, unaligned loads.
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vpshufb    ymm0, ymm0, ymm5  // shuffles within each 128 bit lane.
+    vpshufb    ymm1, ymm1, ymm5
+    sub        ecx, 16           // flags for jg; stores and lea keep them.
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    jg         wloop
+    vzeroupper                   // avoid AVX to SSE transition penalty.
+    ret
+  }
+}
+#endif
+
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....