From bb5a009d11cb4872fe5b4a95a8fa673c896b7eb5 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 7 Apr 2015 23:52:57 +0000 Subject: [PATCH] ARGB4444ToARGB and ARGB1555ToARGB ported to AVX2. BUG=421 TESTED=out\release\libyuv_unittest --gtest_filter=*ARGB4444ToARGB* R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/48009004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1363 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 30 +++++++++----- include/libyuv/version.h | 2 +- source/convert.cc | 16 ++++++++ source/convert_argb.cc | 16 ++++++++ source/row_any.cc | 8 ++++ source/row_win.cc | 86 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 148 insertions(+), 12 deletions(-) diff --git a/README.chromium b/README.chromium index e84126aa0..eedd7a239 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1362 +Version: 1363 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 40ea57c0b..4d06eb94a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -65,6 +65,7 @@ extern "C" { #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSETROW_X86 #define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 @@ -83,7 +84,7 @@ extern "C" { #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 -#define HAS_J400TOARGBROW_SSE2 +#define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 @@ -97,6 +98,8 @@ extern "C" { #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 +#define HAS_J400TOARGBROW_SSE2 +#define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 #define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSSE3 @@ -113,20 +116,17 @@ extern 
"C" { #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 -#define HAS_SETROW_X86 #define HAS_SETROW_ERMS -#define HAS_ARGBSETROW_X86 +#define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 #define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUVROW_SSE2 #define HAS_UYVYTOYROW_SSE2 -#define HAS_I400TOARGBROW_SSE2 #define HAS_YUY2TOARGBROW_SSSE3 #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 -#define HAS_J422TOARGBROW_SSSE3 // Effects: #define HAS_ARGBADDROW_SSE2 @@ -186,10 +186,12 @@ extern "C" { // The following are available require VS2012. Port to GCC. #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) -// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393 +// TODO(fbarchard): Fix GCC AVX2 versions of YUV conversion. bug=393 #define HAS_ARGBTOARGB1555ROW_AVX2 #define HAS_ARGBTOARGB4444ROW_AVX2 #define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 #define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 @@ -208,7 +210,6 @@ extern "C" { #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB565ROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2 -// TODO(fbarchard): Port to Neon #define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565DITHERROW_AVX2 #endif @@ -882,6 +883,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix); void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); @@ -897,15 +902,20 @@ void 
ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); + void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); -void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int pix); - void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 7675e104f..5129b5732 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1362 +#define LIBYUV_VERSION 1363 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index e444d574a..a38b93676 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1193,6 +1193,14 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && 
defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1308,6 +1316,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 40abdf96e..44756bc41 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -634,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGB1555TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; @@ -684,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGB4444TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 3fc1ade6a..c0011cd56 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -226,6 +226,14 @@ RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, RGBANY(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, RGB565ToARGBRow_C, 2, 4, 15) #endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) +RGBANY(ARGB1555ToARGBRow_Any_AVX2, 
ARGB1555ToARGBRow_AVX2, ARGB1555ToARGBRow_C,
+       2, 4, 15)
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_AVX2)
+RGBANY(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, ARGB4444ToARGBRow_C,
+       2, 4, 15)
+#endif
 #if defined(HAS_YUY2TOARGBROW_AVX2)
 RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31)
 RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31)
diff --git a/source/row_win.cc b/source/row_win.cc
index 2d09a3f77..3bfa97431 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -573,6 +573,100 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
 }
 #endif  // HAS_RGB565TOARGBROW_AVX2
 
+#ifdef HAS_ARGB1555TOARGBROW_AVX2
+// Converts a row of ARGB1555 (16 bit) pixels to ARGB (32 bit), 16 pixels
+// per loop iteration.  The loop body always runs at least once and only
+// handles whole groups of 16, so pix should be a positive multiple of 16.
+__declspec(naked) __declspec(align(16))
+void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    vmovd      xmm6, eax  // VEX form; legacy movd would mix SSE into AVX code
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax, [esp + 4]  // src_argb1555
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax  // edx = dst - 2 * src so [eax * 2 + edx] tracks dst
+    sub        edx, eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
+    vpermq     ymm0, ymm0, 0xd8  // qwords 0,2,1,3: unpack works per 128 bit lane
+    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
+    vpand      ymm1, ymm1, ymm3
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpsraw     ymm2, ymm0, 8  // A (sign bit replicated into the upper byte)
+    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
+    vpand      ymm2, ymm2, ymm7
+    vpor       ymm0, ymm0, ymm2  // AG
+    vpunpckhbw ymm2, ymm1, ymm0  // pixels 8..15 thanks to the vpermq above
+    vpunpcklbw ymm1, ymm1, ymm0  // pixels 0..7
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea        eax, [eax + 32]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB1555TOARGBROW_AVX2
+
+#ifdef HAS_ARGB4444TOARGBROW_AVX2
+// Converts a row of ARGB4444 (16 bit) pixels to ARGB (32 bit), 16 pixels
+// per loop iteration.  Each 4 bit channel is widened to 8 bits by repeating
+// the nibble.  pix should be a positive multiple of 16.
+__declspec(naked) __declspec(align(16))
+void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov        eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    vmovd      xmm4, eax
+    vbroadcastss ymm4, xmm4
+    vpslld     ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
+    mov        eax, [esp + 4]  // src_argb4444
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax  // edx = dst - 2 * src so [eax * 2 + edx] tracks dst
+    sub        edx, eax
+
+ convertloop:
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
+    vpermq     ymm0, ymm0, 0xd8  // qwords 0,2,1,3: unpack works per 128 bit lane
+    vpand      ymm2, ymm0, ymm5  // mask high nibbles
+    vpand      ymm0, ymm0, ymm4  // mask low nibbles
+    vpsrlw     ymm3, ymm2, 4
+    vpsllw     ymm1, ymm0, 4
+    vpor       ymm2, ymm2, ymm3  // high nibbles repeated into both halves
+    vpor       ymm0, ymm0, ymm1  // low nibbles repeated into both halves
+    vpunpckhbw ymm1, ymm0, ymm2  // pixels 8..15 thanks to the vpermq above
+    vpunpcklbw ymm0, ymm0, ymm2  // pixels 0..7
+    vmovdqu    [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
+    lea        eax, [eax + 32]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGB4444TOARGBROW_AVX2
+
 // 24 instructions
 __declspec(naked) __declspec(align(16))
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,