From 9245317e1687744b50f653d631bd808a00314041 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 4 Mar 2015 00:00:50 +0000 Subject: [PATCH] ARGBToRGB565 SSE2 port. BUG=407 TESTED=ARGBToRGB565Dither unittest R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/41039004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1308 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 7 +++ include/libyuv/version.h | 2 +- source/convert_from_argb.cc | 30 +++++++----- source/row_any.cc | 16 +++++++ source/row_win.cc | 95 +++++++++++++++++++++++++++++++------ 6 files changed, 124 insertions(+), 28 deletions(-) diff --git a/README.chromium b/README.chromium index e16953bea..ccf25feea 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1306 +Version: 1307 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 80e844bae..8c6feda3a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -198,6 +198,7 @@ extern "C" { #define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_SSE2 #endif // The following are available on all x86 platforms, but @@ -905,6 +906,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int pix); + void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -1375,6 +1379,9 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int pix); + void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 2c996980c..04624d0a1 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1306 +#define LIBYUV_VERSION 1307 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index dc2186a6a..ce5d97e1c 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -804,15 +804,16 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, return 0; } -static const uint8 kDither8x8[64] = { - 0, 128, 32, 160, 8, 136, 40, 168, - 192, 64, 224, 96, 200, 72, 232, 104, - 48, 176, 16, 144, 56, 184, 24, 152, - 240, 112, 208, 80, 248, 120, 216, 88, - 12, 140, 44, 172, 4, 132, 36, 164, - 204, 76, 236, 108, 196, 68, 228, 100, - 60, 188, 28, 156, 52, 180, 20, 148, - 252, 124, 220, 92, 244, 116, 212, 84, +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8 kDither565_8x8[64] = { + 0 >> 5, 128 >> 5, 32 >> 5, 160 >> 5, 8 >> 5, 136 >> 5, 40 >> 5, 168 >> 5, + 192 >> 5, 64 >> 5, 224 >> 5, 96 >> 5, 200 >> 5, 72 >> 5, 232 >> 5, 104 >> 5, + 48 >> 5, 176 >> 5, 16 >> 5, 144 >> 5, 56 >> 5, 184 >> 5, 24 >> 5, 152 >> 5, + 240 >> 5, 112 >> 5, 208 >> 5, 80 >> 5, 248 >> 5, 120 >> 5, 216 >> 5, 88 >> 5, + 12 >> 5, 140 >> 5, 44 >> 5, 172 >> 5, 4 >> 5, 132 >> 5, 36 >> 5, 164 >> 5, + 204 >> 5, 76 >> 5, 236 >> 5, 108 >> 5, 196 >> 5, 68 >> 5, 228 >> 5, 100 >> 5, + 60 >> 5, 188 >> 5, 28 >> 5, 156 >> 5, 52 >> 5, 180 >> 5, 20 >> 5, 148 >> 5, + 252 >> 5, 124 >> 5, 220 >> 5, 92 >> 5, 244 >> 5, 116 >> 5, 212 >> 5, 84 >> 5, }; // Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). @@ -832,9 +833,16 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } if (!dither8x8) { - dither8x8 = kDither8x8; - + dither8x8 = kDither565_8x8; } +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, dither8x8 + ((y & 7) << 3), width); diff --git a/source/row_any.cc b/source/row_any.cc index 19340b3b7..631b09a4c 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -225,6 +225,22 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7) #endif #undef RGBANY +#define RGBDANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src, uint8* dst, \ + const uint8* dither8x8, int width) { \ + int n = width & ~MASK; \ + if (n > 0) { \ + ARGBTORGB_SIMD(src, dst, dither8x8, n); \ + } \ + ARGBTORGB_C(src + n * SBPP, dst + n * BPP, dither8x8, width & MASK); \ + } + +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +RGBDANY(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, + ARGBToRGB565DitherRow_C, 4, 2, 7) +#endif +#undef RGBDANY + // ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. #define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \ diff --git a/source/row_win.cc b/source/row_win.cc index 5c06b6078..e6796e407 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -585,6 +585,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } } +// 4 pixels __declspec(naked) __declspec(align(16)) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { @@ -622,6 +623,70 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } +// 8 pixels +__declspec(naked) __declspec(align(16)) +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8, int pix) { + __asm { + mov eax, [esp + 12] // dither8 + movq xmm6, qword ptr [eax] // fetch 8 dither values + punpcklbw xmm6, xmm6 + movdqa xmm7, xmm6 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 16] // pix + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + + movdqu xmm0, [eax + 16] // fetch 4 pixels of argb + paddusb xmm0, xmm7 + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 // store 4 pixels of RGB565 + + lea eax, [eax + 32] + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + ret + } +} + // TODO(fbarchard): Improve sign extension/packing. __declspec(naked) __declspec(align(16)) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { @@ -1646,8 +1711,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vpermq ymm2, ymm2, 0xd8 \ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ - __asm vmovdqu [edx], ymm1 \ - __asm vmovdqu [edx + 32], ymm0 \ + __asm vmovdqu 0[edx], ymm1 \ + __asm vmovdqu 32[edx], ymm0 \ __asm lea edx, [edx + 64] \ } @@ -1959,8 +2024,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm1, xmm0 \ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ - __asm movdqu [edx], xmm0 \ - __asm movdqu [edx + 16], xmm1 \ + __asm movdqu 0[edx], xmm0 \ + __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32] \ } @@ -1973,8 +2038,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm0, xmm5 \ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ - __asm movdqu [edx], xmm5 \ - __asm movdqu [edx + 16], xmm0 \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32] \ } @@ -1986,8 +2051,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm1, xmm2 \ __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ - __asm movdqu [edx], xmm2 \ - __asm movdqu [edx + 16], xmm1 \ + __asm movdqu 0[edx], xmm2 \ + __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32] \ } @@ -2000,8 +2065,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm0, xmm5 \ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ - __asm movdqu [edx], xmm5 \ - __asm movdqu [edx + 16], xmm0 \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32] \ } @@ -2017,8 +2082,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ - __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24] \ } @@ -2034,8 +2099,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ - __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24] \ } @@ -2071,7 +2136,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm por xmm3, xmm2 /* BG */ \ __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ __asm lea edx, [edx + 16] \ }