From ccd6d9b2de6af7985775a2e5537190cf5794dd44 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 13 Jan 2012 19:26:50 +0000 Subject: [PATCH] ARGB1555ToARGBRow_SSE2 BUG=none TEST=media_unittest Review URL: http://webrtc-codereview.appspot.com/349006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@133 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- source/row.h | 80 ++++------------- source/row_common.cc | 9 ++ source/row_win.cc | 204 +++++++++++++++++++++++++++++++++++++++---- 4 files changed, 213 insertions(+), 82 deletions(-) diff --git a/README.chromium b/README.chromium index 24359ae46..7e1df26af 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 132 +Version: 133 License: BSD License File: LICENSE diff --git a/source/row.h b/source/row.h index 192ab5680..0cbd7f0a7 100644 --- a/source/row.h +++ b/source/row.h @@ -60,8 +60,9 @@ // The following are available on Windows platforms #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) -#define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_RGB565TOARGBROW_SSE2 +#define HAS_ARGB1555TOARGBROW_SSE2 +#define HAS_ARGB4444TOARGBROW_SSE2 #endif // The following are available on Neon platforms @@ -82,64 +83,60 @@ namespace libyuv { extern "C" { #endif -#ifdef HAS_FASTCONVERTYUVTOARGBROW_NEON +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +typedef __declspec(align(16)) signed char vec8[16]; +typedef __declspec(align(16)) unsigned char uvec8[16]; +typedef __declspec(align(16)) signed short vec16[8]; +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +typedef signed char __attribute__((vector_size(16))) vec8; +typedef unsigned char __attribute__((vector_size(16))) uvec8; +typedef signed short __attribute__((vector_size(16))) vec16; +#endif + + void FastConvertYUVToARGBRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif 
-#ifdef HAS_FASTCONVERTYUVTOBGRAROW_NEON void FastConvertYUVToBGRARow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYUVTOABGRROW_NEON void FastConvertYUVToABGRRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYUVTORGB565ROW_NEON void FastConvertYUVToRGB565Row_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYUVTOARGB1555ROW_NEON void FastConvertYUVToARGB1555Row_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYUVTOARGB4444ROW_NEON void FastConvertYUVToARGB4444Row_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYUVTORGB24ROW_NEON void FastConvertYUVToRGB24Row_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYUVTORAWROW_NEON void FastConvertYUVToRAWRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_ARGBTOYROW_SSSE3 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); @@ -149,11 +146,6 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -#endif -#if defined(HAS_RGB24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3) -#define HASRGB24TOYROW_SSSE3 -#endif -#ifdef HASRGB24TOYROW_SSSE3 void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void 
RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); @@ -171,16 +163,9 @@ void ARGB1555ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, void ARGB4444ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -#endif -#ifdef HAS_REVERSE_ROW_SSSE3 void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width); -#endif -#ifdef HAS_REVERSE_ROW_SSE2 void ReverseRow_SSE2(const uint8* src, uint8* dst, int width); -#endif -#ifdef HAS_REVERSE_ROW_NEON void ReverseRow_NEON(const uint8* src, uint8* dst, int width); -#endif void ReverseRow_C(const uint8* src, uint8* dst, int width); void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); @@ -209,20 +194,14 @@ void ARGB1555ToUVRow_C(const uint8* src_argb0, int src_stride_argb, void ARGB4444ToUVRow_C(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -#ifdef HAS_RGB24TOARGBROW_SSSE3 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix); void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix); void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); -// TODO(fbarchard): SSE2 555 -//void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix); -#endif -#ifdef HAS_RGB565TOARGBROW_SSE2 +void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix); void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix); -#endif -#ifdef HAS_ARGB4444TOARGBROW_SSE2 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix); -#endif + void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix); void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix); void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix); @@ -231,27 +210,9 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); void 
ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); -#ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); -#endif void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); -#if defined(_MSC_VER) -#define SIMD_ALIGNED(var) __declspec(align(16)) var -typedef __declspec(align(16)) signed char vec8[16]; -typedef __declspec(align(16)) unsigned char uvec8[16]; -typedef __declspec(align(16)) signed short vec16[8]; -#else // __GNUC__ -#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) -typedef signed char __attribute__((vector_size(16))) vec8; -typedef unsigned char __attribute__((vector_size(16))) uvec8; -typedef signed short __attribute__((vector_size(16))) vec16; -#endif - -extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]); -extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]); -extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]); - void FastConvertYUVToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -310,7 +271,6 @@ void FastConvertYToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); -#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2 void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -344,9 +304,7 @@ void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, void FastConvertYToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3 void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -400,15 +358,11 @@ void FastConvertYUVToRAWRow_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* rgb_buf, int width); -#endif -#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 void FastConvertYToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width); -#endif - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git 
a/source/row_common.cc b/source/row_common.cc index 6a02a4b8a..4c52bef43 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -328,7 +328,11 @@ void RGB565ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGB1555ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { SIMD_ALIGNED(uint8 row[kMaxStride]); +#ifdef HAS_ARGB1555TOARGBROW_SSE2 + ARGB1555ToARGBRow_SSE2(src_argb, row, pix); +#else ARGB1555ToARGBRow_C(src_argb, row, pix); +#endif ARGBToYRow_SSSE3(row, dst_y, pix); } @@ -378,8 +382,13 @@ void RGB565ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, void ARGB1555ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix) { SIMD_ALIGNED(uint8 row[kMaxStride * 2]); +#ifdef HAS_ARGB1555TOARGBROW_SSE2 + ARGB1555ToARGBRow_SSE2(src_argb, row, pix); + ARGB1555ToARGBRow_SSE2(src_argb + src_stride_argb, row + kMaxStride, pix); +#else ARGB1555ToARGBRow_C(src_argb, row, pix); ARGB1555ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); +#endif ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); } diff --git a/source/row_win.cc b/source/row_win.cc index e3325b959..27a9f593e 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -229,53 +229,50 @@ __asm { } } +#ifdef SHIFT565 +// Below shift/mask code is efficient and works, but more instructions than +// pmul method // TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc +// 29 instructions __declspec(naked) -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int pix) { +void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { __asm { mov eax, [esp + 4] // src_rgb565 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 for Alpha pslld xmm5, 24 - pcmpeqb xmm4, xmm4 // generate mask 0xf800f800 + pcmpeqb xmm4, xmm4 // generate mask 0xf800f800 for Red psllw xmm4, 11 - pcmpeqb xmm6, xmm6 // 
generate mask 0x001f001f + pcmpeqb xmm6, xmm6 // generate mask 0x001f001f for Blue psrlw xmm6, 11 - pcmpeqb xmm7, xmm7 // generate mask 0x00fc00fc + pcmpeqb xmm7, xmm7 // generate mask 0x00fc00fc for Green psrlw xmm7, 10 psllw xmm7, 2 - convertloop: movdqa xmm0, [eax] // fetch 8 pixels of bgr565 lea eax, [eax + 16] - movdqa xmm1, xmm0 movdqa xmm2, xmm0 pand xmm1, xmm4 // R in upper 5 bits psrlw xmm2, 13 // R 3 bits psllw xmm2, 8 por xmm1, xmm2 - movdqa xmm2, xmm0 pand xmm2, xmm6 // mask B 5 bits movdqa xmm3, xmm2 psllw xmm2, 3 psrlw xmm3, 2 por xmm2, xmm3 - por xmm1, xmm2 // RB - psrlw xmm0, 3 // G in top 6 bits of lower byte pand xmm0, xmm7 // mask G 6 bits movdqa xmm2, xmm0 psrlw xmm2, 6 por xmm0, xmm2 - por xmm0, xmm5 // AG - movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -288,7 +285,177 @@ __asm { } } +// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc +// 33 instructions +__declspec(naked) +void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { +__asm { + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 for Alpha + psllw xmm5, 8 + pcmpeqb xmm4, xmm4 // generate mask 0xf800f800 for Red + psllw xmm4, 11 + pcmpeqb xmm6, xmm6 // generate mask 0x001f001f for Blue + psrlw xmm6, 11 + pcmpeqb xmm7, xmm7 // generate mask 0x00f800f8 for Green + psrlw xmm7, 11 + psllw xmm7, 3 + + convertloop: + movdqa xmm0, [eax] // fetch 8 pixels of bgr565 + lea eax, [eax + 16] + movdqa xmm1, xmm0 + psllw xmm1, 1 + movdqa xmm2, xmm0 + pand xmm1, xmm4 // R in upper 5 bits + psrlw xmm2, 13 // R 3 bits + psllw xmm2, 8 + por xmm1, xmm2 + movdqa xmm2, xmm0 + pand xmm2, xmm6 // mask B 5 bits + movdqa xmm3, xmm2 + psllw xmm2, 3 + psrlw xmm3, 2 + por xmm2, xmm3 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + psrlw xmm2, 2 // G in top 5 bits of lower byte + pand xmm2, xmm7 // mask G 5 bits + movdqa xmm3, xmm2 + psrlw xmm3, 5 + por xmm2, xmm3 + psraw 
xmm0, 8 // A + pand xmm0, xmm5 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [edx], xmm1 // store 4 pixels of ARGB + movdqa [edx + 16], xmm2 // store next 4 pixels of ARGB + lea edx, [edx + 32] + sub ecx, 8 + ja convertloop + ret + } +} +#endif + +// pmul method to replicate bits +// Math to replicate bits +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 2 +// 20 instructions +__declspec(naked) +void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { +__asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + psllw xmm4, 10 + psrlw xmm4, 5 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqa xmm0, [eax] // fetch 8 pixels of bgr565 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + ja convertloop + ret + } +} + +// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc +// 24 instructions +__declspec(naked) +void ARGB1555ToARGBRow_SSE2(const uint8* 
src_argb1555, uint8* dst_argb, + int pix) { +__asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + movd xmm5, eax + pshufd xmm5, xmm5, 0 + mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits + movd xmm6, eax + pshufd xmm6, xmm6, 0 + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + psllw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x03e003e0 for Green + psllw xmm4, 11 + psrlw xmm4, 6 + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + psllw xmm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + movdqa xmm0, [eax] // fetch 8 pixels of 1555 + movdqa xmm1, xmm0 + movdqa xmm2, xmm0 + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pand xmm1, xmm3 + pmulhuw xmm2, xmm5 // * (256 + 8) + pmulhuw xmm1, xmm5 // * (256 + 8) + psllw xmm1, 8 + por xmm1, xmm2 // RB + movdqa xmm2, xmm0 + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm2, xmm7 + por xmm0, xmm2 // AG + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm0 + punpckhbw xmm2, xmm0 + movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB + lea eax, [eax + 16] + sub ecx, 8 + ja convertloop + ret + } +} + // TODO(fbarchard): Port ARGB4444ToARGBRow_SSE2 to gcc +// 18 instructions __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix) { @@ -301,10 +468,11 @@ __asm { mov eax, [esp + 4] // src_argb4444 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax convertloop: - movdqa xmm0, qword ptr [eax] // fetch 8 pixels of bgra4444 - lea eax, [eax + 16] + movdqa xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 pand xmm0, xmm4 // mask low nibbles pand xmm2, xmm5 // mask high nibbles @@ -317,9 +485,9 @@ __asm { movdqa xmm1, xmm0 punpcklbw xmm0, xmm2 
punpckhbw xmm1, xmm2 - movdqa [edx], xmm0 // store 4 pixels of ARGB - movdqa [edx + 16], xmm1 // store next 4 pixels of ARGB - lea edx, [edx + 32] + movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB + movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB + lea eax, [eax + 16] sub ecx, 8 ja convertloop ret