From cc89e3a77be30a27f7c56ee32860e5bbc9a00cc2 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 7 Oct 2015 18:24:50 -0700 Subject: [PATCH] port ARGB to 565 dithering SSE2 code to GCC. Previously the assembly code was only available to Windows. This CL ports the SSE2 code to GCC syntax. When running a profiler on all the unittests, this function was the slowest of all functions that still ran in C code. 3.71% libyuv_unittest libyuv_unittest [.] ARGBToRGB565DitherRow_C Was ARGBToRGB565Dither_Opt (2894 ms) Now ARGBToRGB565Dither_Opt (432 ms) TBR=harryjin@google.com BUG=libyuv:492 Review URL: https://codereview.chromium.org/1397673002 . --- include/libyuv/row.h | 67 ++++++++++++++++++++------------------------ source/row_gcc.cc | 46 ++++++++++++++++++++++++++++++ source/row_win.cc | 2 -- 3 files changed, 76 insertions(+), 39 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8695d894d..5352209b3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -91,6 +91,7 @@ extern "C" { #define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 @@ -102,8 +103,12 @@ extern "C" { #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 +#define HAS_H422TOABGRROW_SSSE3 +#define HAS_H422TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422ALPHATOABGRROW_SSSE3 +#define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -115,11 +120,11 @@ extern "C" { #define HAS_I422TORGBAROW_SSSE3 #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 +#define HAS_I444TOABGRROW_SSSE3 +#define HAS_I444TOARGBROW_SSSE3 #define HAS_J400TOARGBROW_SSE2 -#define HAS_J422TOARGBROW_SSSE3 #define HAS_J422TOABGRROW_SSSE3 -#define HAS_H422TOARGBROW_SSSE3 -#define HAS_H422TOABGRROW_SSSE3 +#define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_UV_SSSE3 @@ -145,10 +150,6 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 -#define HAS_I444TOARGBROW_SSSE3 -#define HAS_I444TOABGRROW_SSSE3 -#define HAS_I422ALPHATOARGBROW_SSSE3 -#define HAS_I422ALPHATOABGRROW_SSSE3 // Effects: #define HAS_ARGBADDROW_SSE2 @@ -184,10 +185,10 @@ extern "C" { // The following are also available on x64 Visual C. #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) -#define HAS_I422TOARGBROW_SSSE3 -#define HAS_I422TOABGRROW_SSSE3 -#define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I422ALPHATOABGRROW_SSSE3 +#define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 #endif // The following are available for AVX2 Visual C and clangcl 32 bit: @@ -199,17 +200,16 @@ extern "C" { #define HAS_ARGBTOARGB1555ROW_AVX2 #define HAS_ARGBTOARGB4444ROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 -#define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565ROW_AVX2 #define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TORGB565ROW_AVX2 -#define HAS_I444TOARGBROW_AVX2 #define HAS_I444TOABGRROW_AVX2 +#define HAS_I444TOARGBROW_AVX2 #define HAS_J400TOARGBROW_AVX2 -#define HAS_RGB565TOARGBROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 #endif // The following are available on all x86 platforms, but @@ -226,7 +226,11 @@ extern "C" { #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX +#define HAS_H422TOABGRROW_AVX2 +#define HAS_H422TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 +#define HAS_I422ALPHATOABGRROW_AVX2 +#define HAS_I422ALPHATOARGBROW_AVX2 #define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOARGBROW_AVX2 #define HAS_I422TOBGRAROW_AVX2 @@ -234,12 +238,12 @@ extern "C" { #define HAS_I422TORGB24ROW_AVX2 #define HAS_I422TORGBAROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 -#define HAS_J422TOARGBROW_AVX2 #define HAS_J422TOABGRROW_AVX2 -#define HAS_H422TOARGBROW_AVX2 -#define HAS_H422TOABGRROW_AVX2 +#define HAS_J422TOARGBROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -249,10 +253,6 @@ extern "C" { #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 -#define HAS_NV12TOARGBROW_AVX2 -#define HAS_NV21TOARGBROW_AVX2 -#define HAS_I422ALPHATOARGBROW_AVX2 -#define HAS_I422ALPHATOABGRROW_AVX2 // Effects: #define HAS_ARGBADDROW_AVX2 @@ -273,10 +273,12 @@ extern "C" { #define HAS_ARGB4444TOARGBROW_NEON #define HAS_ARGB4444TOUVROW_NEON #define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBSETROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON #define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV422ROW_NEON @@ -288,19 +290,12 @@ extern "C" { #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON -#define HAS_J400TOARGBROW_NEON +#define HAS_I400TOARGBROW_NEON #define HAS_I411TOARGBROW_NEON -#define HAS_I422TOARGBROW_NEON #define HAS_I422TOABGRROW_NEON #define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB4444ROW_NEON -// TODO(fbarchard): Implement aarch64 neon version -#ifndef __aarch64__ -#define HAS_J422TOARGBROW_NEON -#define HAS_J422TOABGRROW_NEON -#define HAS_H422TOARGBROW_NEON -#define HAS_H422TOABGRROW_NEON -#endif +#define HAS_I422TOARGBROW_NEON #define HAS_I422TOBGRAROW_NEON #define HAS_I422TORAWROW_NEON #define HAS_I422TORGB24ROW_NEON @@ -309,6 +304,7 @@ extern "C" { #define HAS_I422TOUYVYROW_NEON #define HAS_I422TOYUY2ROW_NEON #define HAS_I444TOARGBROW_NEON +#define HAS_J400TOARGBROW_NEON #define HAS_MERGEUVROW_NEON #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON @@ -327,29 +323,28 @@ extern "C" { #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON -#define HAS_ARGBSETROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON -#define HAS_I400TOARGBROW_NEON #define HAS_YUY2TOARGBROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON -#define HAS_ARGBTORGB565DITHERROW_NEON // Effects: #define HAS_ARGBADDROW_NEON #define HAS_ARGBATTENUATEROW_NEON #define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON #define HAS_INTERPOLATEROW_NEON #define HAS_SOBELROW_NEON @@ -357,8 +352,6 @@ extern "C" { #define HAS_SOBELXROW_NEON #define HAS_SOBELXYROW_NEON #define HAS_SOBELYROW_NEON -#define HAS_ARGBCOLORMATRIXROW_NEON -#define HAS_ARGBSHUFFLEROW_NEON #endif // The following are available on Mips platforms: @@ -457,9 +450,9 @@ struct YuvConstants { #define KYTORGB 192 #endif -extern struct YuvConstants kYuvConstants; -extern struct YuvConstants kYuvJConstants; -extern struct YuvConstants kYuvHConstants; +extern struct YuvConstants kYuvConstants; // BT.601 +extern struct YuvConstants kYuvJConstants; // JPeg color space +extern struct YuvConstants kYuvHConstants; // BT.709 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 3ce0f0a49..e4084e41c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -526,6 +526,52 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { ); } +void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, + const uint32 dither4, int pix) { + asm volatile ( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(pix) // %2 + : "m"(dither4) // %3 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { asm volatile ( "pcmpeqb %%xmm4,%%xmm4 \n" diff --git a/source/row_win.cc b/source/row_win.cc index d0e691cdb..0a00fd1fc 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -833,7 +833,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -// 4 pixels __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { @@ -871,7 +870,6 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -// 8 pixels __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int pix) {