From 212a1a5000dbdd6bcff7ec355fa2bfa6a00183f8 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Sat, 5 Oct 2013 04:17:50 +0000 Subject: [PATCH] ARGBShuffle_SSE2 for lower end CPUs BUG=271 TESTED=out\release\libyuv_unittest --gtest_filter=**R*ToARGB* R=johannkoenig@google.com, ryanpetrie@google.com Review URL: https://webrtc-codereview.appspot.com/2361004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@807 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 6 ++ include/libyuv/version.h | 2 +- source/planar_functions.cc | 8 +++ source/row_any.cc | 4 ++ source/row_win.cc | 126 ++++++++++++++++++++++++++++++++++++- 6 files changed, 143 insertions(+), 5 deletions(-) diff --git a/README.chromium b/README.chromium index 1145a9abc..3ccd4ff58 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 806 +Version: 807 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d2a23410b..48d2ab025 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -163,6 +163,8 @@ extern "C" { // The following are Windows only: // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#define HAS_ARGBSHUFFLEROW_SSE2 + // Effects: // TODO(fbarchard): Optimize and enable // #define HAS_ARGBLUMACOLORTABLEROW_SSSE3 @@ -709,6 +711,8 @@ void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, // ARGBShufflers for BGRAToARGB etc. 
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, @@ -717,6 +721,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e6e0a629b..8f9e8d5db 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 806 +#define LIBYUV_VERSION 807 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index ee0487a73..b8b8be425 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1815,6 +1815,14 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, const uint8* shuffler, int pix) = ARGBShuffleRow_C; +#if defined(HAS_ARGBSHUFFLEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_SSE2; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; diff --git a/source/row_any.cc b/source/row_any.cc index 
a7e371fc1..d04673574 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -474,6 +474,10 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, dst_argb + n * BPP, shuffler, width & MASK); \ } +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, + ARGBShuffleRow_C, 4, 4, 3) +#endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, ARGBShuffleRow_C, 4, 4, 7) diff --git a/source/row_win.cc b/source/row_win.cc index 840eb7531..b3c4c4735 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -6552,7 +6552,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqa xmm5, [ecx] mov ecx, [esp + 16] // pix @@ -6578,7 +6578,7 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqa xmm5, [ecx] mov ecx, [esp + 16] // pix @@ -6605,7 +6605,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
mov ecx, [esp + 16] // pix
@@ -6629,6 +6629,126 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 }
 #endif
 
+__declspec(naked) __declspec(align(16))  // naked: arguments are read straight off esp and the routine issues its own ret
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,  // reorders the 4 bytes of every pixel of src_argb into dst_argb
+                         const uint8* shuffler, int pix) {  // only shuffler[0..3] are read here; pix is counted in 4-byte pixels
+  __asm {
+    push ebx  // ebx and esi are used as scratch below; both are restored at shuf99
+    push esi
+    mov eax, [esp + 8 + 4] // src_argb (the "+ 8" skips the two registers pushed above)
+    mov edx, [esp + 8 + 8] // dst_argb
+    mov esi, [esp + 8 + 12] // shuffler
+    mov ecx, [esp + 8 + 16] // pix
+    pxor xmm7, xmm7  // xmm7 = 0; unpacked against pixel data below to widen bytes to words
+
+    mov ebx, [esi] // shuffler: first 4 bytes loaded as one little-endian dword
+    cmp ebx, 0x03000102  // shuffler bytes in memory order: 2, 1, 0, 3
+    je shuf_3012
+    cmp ebx, 0x00010203  // shuffler bytes in memory order: 3, 2, 1, 0
+    je shuf_0123
+    cmp ebx, 0x00030201  // shuffler bytes in memory order: 1, 2, 3, 0
+    je shuf_0321
+    cmp ebx, 0x02010003  // shuffler bytes in memory order: 3, 0, 1, 2
+    je shuf_2103
+
+    // No known pattern matched: fall into the generic loop. TODO(fbarchard): Use one source pointer and 3 offsets.
+ shuf_any1:  // generic path: one pixel (4 bytes) per iteration, any shuffler
+    movzx ebx, byte ptr [esi]  // ebx = shuffler[0], used as a byte offset into the source pixel
+    movzx ebx, byte ptr [eax + ebx]  // ebx = source byte at that offset
+    mov [edx], bl  // destination byte 0
+    movzx ebx, byte ptr [esi + 1]  // same lookup for shuffler[1]
+    movzx ebx, byte ptr [eax + ebx]
+    mov [edx + 1], bl  // destination byte 1
+    movzx ebx, byte ptr [esi + 2]  // same lookup for shuffler[2]
+    movzx ebx, byte ptr [eax + ebx]
+    mov [edx + 2], bl  // destination byte 2
+    movzx ebx, byte ptr [esi + 3]  // same lookup for shuffler[3]
+    movzx ebx, byte ptr [eax + ebx]
+    mov [edx + 3], bl  // destination byte 3
+    lea eax, [eax + 4]  // advance source by one pixel (lea leaves flags alone)
+    lea edx, [edx + 4]  // advance destination by one pixel
+    sub ecx, 1  // one pixel done
+    jg shuf_any1  // loop while pixels remain
+    jmp shuf99
+
+    align 16
+ shuf_0123:  // fast path: 4 pixels (16 bytes) per iteration
+    movdqu xmm0, [eax]  // unaligned load of 4 pixels
+    lea eax, [eax + 16]
+    movdqa xmm1, xmm0  // second copy so the low and high 8 bytes can be widened separately
+    punpcklbw xmm0, xmm7  // low 8 bytes -> 8 words, upper byte of each word zero
+    punpckhbw xmm1, xmm7  // high 8 bytes -> 8 words, upper byte of each word zero
+    pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
+    pshuflw xmm0, xmm0, 01Bh  // each group of 4 words is one pixel; the immediate reorders its channels
+    pshufhw xmm1, xmm1, 01Bh
+    pshuflw xmm1, xmm1, 01Bh
+    packuswb xmm0, xmm1  // narrow the 16 words back to 16 bytes
+    sub ecx, 4  // sets the flags tested by jg below; movdqu and lea do not modify them
+    movdqu [edx], xmm0  // unaligned store of 4 pixels
+    lea edx, [edx + 16]
+    jg shuf_0123  // loop while pixels remain
+    jmp shuf99
+
+    align 16
+ shuf_0321:  // same sequence as shuf_0123 with word-shuffle immediate 039h
+    movdqu xmm0, [eax]
+    lea eax, [eax + 16]
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
+    pshuflw xmm0, xmm0, 039h
+    pshufhw xmm1, xmm1, 039h
+    pshuflw xmm1, xmm1, 039h
+    packuswb xmm0, xmm1
+    sub ecx, 4  // flags from here are consumed by jg below
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    jg shuf_0321
+    jmp shuf99
+
+    align 16
+ shuf_2103:  // same sequence as shuf_0123 with word-shuffle immediate 093h
+    movdqu xmm0, [eax]
+    lea eax, [eax + 16]
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
+    pshuflw xmm0, xmm0, 093h
+    pshufhw xmm1, xmm1, 093h
+    pshuflw xmm1, xmm1, 093h
+    packuswb xmm0, xmm1
+    sub ecx, 4  // flags from here are consumed by jg below
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    jg shuf_2103
+    jmp shuf99
+
+    align 16
+ shuf_3012:  // same sequence as shuf_0123 with word-shuffle immediate 0C6h
+    movdqu xmm0, [eax]
+    lea eax, [eax + 16]
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
+    pshuflw xmm0, xmm0, 0C6h
+    pshufhw xmm1, xmm1, 0C6h
+    pshuflw xmm1, xmm1, 0C6h
+    packuswb xmm0, xmm1
+    sub ecx, 4  // flags from here are consumed by jg below
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    jg shuf_3012  // last path: falls through into shuf99 when done
+
+ shuf99:  // common exit for every path
+    pop esi  // restore in reverse order of the pushes at entry
+    pop ebx
+    ret
+  }
+}
+
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....