port ARGB to 565 dithering SSE2 code to GCC.

Previously the assembly code was only available to Windows.
This CL ports the SSE2 code to GCC syntax.

When running a profiler on all the unittests, this function
was the slowest of all functions that still ran in C code.
   3.71%  libyuv_unittest  libyuv_unittest      [.] ARGBToRGB565DitherRow_C

Was
ARGBToRGB565Dither_Opt (2894 ms)
Now
ARGBToRGB565Dither_Opt (432 ms)

TBR=harryjin@google.com
BUG=libyuv:492

Review URL: https://codereview.chromium.org/1397673002 .
This commit is contained in:
Frank Barchard 2015-10-07 18:24:50 -07:00
parent 3e38762d6b
commit cc89e3a77b
3 changed files with 76 additions and 39 deletions

View File

@ -91,6 +91,7 @@ extern "C" {
#define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_ARGBTOUV422ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
@ -102,8 +103,12 @@ extern "C" {
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_H422TOABGRROW_SSSE3
#define HAS_H422TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
#define HAS_I422ALPHATOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
@ -115,11 +120,11 @@ extern "C" {
#define HAS_I422TORGBAROW_SSSE3
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOABGRROW_SSSE3
#define HAS_I444TOARGBROW_SSSE3
#define HAS_J400TOARGBROW_SSE2
#define HAS_J422TOARGBROW_SSSE3
#define HAS_J422TOABGRROW_SSSE3
#define HAS_H422TOARGBROW_SSSE3
#define HAS_H422TOABGRROW_SSSE3
#define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_UV_SSSE3
@ -145,10 +150,6 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
#define HAS_I444TOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422ALPHATOABGRROW_SSSE3
// Effects:
#define HAS_ARGBADDROW_SSE2
@ -184,10 +185,10 @@ extern "C" {
// The following are also available on x64 Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \
(!defined(__clang__) || defined(__SSSE3__))
#define HAS_I422TOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422ALPHATOABGRROW_SSSE3
#define HAS_I422ALPHATOARGBROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGBROW_SSSE3
#endif
// The following are available for AVX2 Visual C and clangcl 32 bit:
@ -199,17 +200,16 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_AVX2
#define HAS_ARGBTOARGB4444ROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_AVX2
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_AVX2
#define HAS_I411TOARGBROW_AVX2
#define HAS_I422TOARGB1555ROW_AVX2
#define HAS_I422TOARGB4444ROW_AVX2
#define HAS_I422TORGB565ROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_I444TOABGRROW_AVX2
#define HAS_I444TOARGBROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2
#define HAS_NV12TORGB565ROW_AVX2
#define HAS_RGB565TOARGBROW_AVX2
#endif
// The following are available on all x86 platforms, but
@ -226,7 +226,11 @@ extern "C" {
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_COPYROW_AVX
#define HAS_H422TOABGRROW_AVX2
#define HAS_H422TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2
#define HAS_I422ALPHATOABGRROW_AVX2
#define HAS_I422ALPHATOARGBROW_AVX2
#define HAS_I422TOABGRROW_AVX2
#define HAS_I422TOARGBROW_AVX2
#define HAS_I422TOBGRAROW_AVX2
@ -234,12 +238,12 @@ extern "C" {
#define HAS_I422TORGB24ROW_AVX2
#define HAS_I422TORGBAROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_J422TOABGRROW_AVX2
#define HAS_H422TOARGBROW_AVX2
#define HAS_H422TOABGRROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_SPLITUVROW_AVX2
#define HAS_UYVYTOARGBROW_AVX2
#define HAS_UYVYTOUV422ROW_AVX2
@ -249,10 +253,6 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV21TOARGBROW_AVX2
#define HAS_I422ALPHATOARGBROW_AVX2
#define HAS_I422ALPHATOABGRROW_AVX2
// Effects:
#define HAS_ARGBADDROW_AVX2
@ -273,10 +273,12 @@ extern "C" {
#define HAS_ARGB4444TOARGBROW_NEON
#define HAS_ARGB4444TOUVROW_NEON
#define HAS_ARGB4444TOYROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
#define HAS_ARGBTORGB565DITHERROW_NEON
#define HAS_ARGBTORGB565ROW_NEON
#define HAS_ARGBTOUV411ROW_NEON
#define HAS_ARGBTOUV422ROW_NEON
@ -288,19 +290,12 @@ extern "C" {
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I411TOARGBROW_NEON
#define HAS_I422TOARGBROW_NEON
#define HAS_I422TOABGRROW_NEON
#define HAS_I422TOARGB1555ROW_NEON
#define HAS_I422TOARGB4444ROW_NEON
// TODO(fbarchard): Implement aarch64 neon version
#ifndef __aarch64__
#define HAS_J422TOARGBROW_NEON
#define HAS_J422TOABGRROW_NEON
#define HAS_H422TOARGBROW_NEON
#define HAS_H422TOABGRROW_NEON
#endif
#define HAS_I422TOARGBROW_NEON
#define HAS_I422TOBGRAROW_NEON
#define HAS_I422TORAWROW_NEON
#define HAS_I422TORGB24ROW_NEON
@ -309,6 +304,7 @@ extern "C" {
#define HAS_I422TOUYVYROW_NEON
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I444TOARGBROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
@ -327,29 +323,28 @@ extern "C" {
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_YUY2TOARGBROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
#define HAS_ARGBTORGB565DITHERROW_NEON
// Effects:
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#define HAS_ARGBBLENDROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBMIRRORROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
#define HAS_INTERPOLATEROW_NEON
#define HAS_SOBELROW_NEON
@ -357,8 +352,6 @@ extern "C" {
#define HAS_SOBELXROW_NEON
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBSHUFFLEROW_NEON
#endif
// The following are available on Mips platforms:
@ -457,9 +450,9 @@ struct YuvConstants {
#define KYTORGB 192
#endif
extern struct YuvConstants kYuvConstants;
extern struct YuvConstants kYuvJConstants;
extern struct YuvConstants kYuvHConstants;
extern struct YuvConstants kYuvConstants; // BT.601
extern struct YuvConstants kYuvJConstants; // JPeg color space
extern struct YuvConstants kYuvHConstants; // BT.709
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
#define OMITFP

View File

@ -526,6 +526,52 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
);
}
void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
const uint32 dither4, int pix) {
asm volatile (
"movd %3,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm6 \n"
"movdqa %%xmm6,%%xmm7 \n"
"punpcklwd %%xmm6,%%xmm6 \n"
"punpckhwd %%xmm7,%%xmm7 \n"
"pcmpeqb %%xmm3,%%xmm3 \n"
"psrld $0x1b,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrld $0x1a,%%xmm4 \n"
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"paddusb %%xmm6,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
"psrld $0x3,%%xmm1 \n"
"psrld $0x5,%%xmm2 \n"
"psrad $0x10,%%xmm0 \n"
"pand %%xmm3,%%xmm1 \n"
"pand %%xmm4,%%xmm2 \n"
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"lea 0x10(%0),%0 \n"
"movq %%xmm0,(%1) \n"
"lea 0x8(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
: "m"(dither4) // %3
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"

View File

@ -833,7 +833,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
// 4 pixels
__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
__asm {
@ -871,7 +870,6 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
// 8 pixels
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int pix) {