From 2827277496dd9793863641600bd45708acfa49be Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 6 Apr 2015 19:24:23 +0000 Subject: [PATCH] port RGB565ToARGB to AVX2. BUG=421 TESTED=out\release\libyuv_unittest --gtest_filter=*RGB565ToARGB* R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/49609004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1357 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 32 ++++++++++++++-------- include/libyuv/version.h | 2 +- source/convert.cc | 8 ++++++ source/convert_argb.cc | 8 ++++++ source/row_any.cc | 4 +++ source/row_win.cc | 59 +++++++++++++++++++++++++++++++++++++++- 7 files changed, 100 insertions(+), 15 deletions(-) diff --git a/README.chromium b/README.chromium index b109b2010..10e3e02c5 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1356 +Version: 1357 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 560aeca80..40ea57c0b 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -47,6 +47,9 @@ extern "C" { #define LIBYUV_SSSE3_ONLY #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // clang >= 3.5.0 required for Arm64. #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) @@ -184,26 +187,27 @@ extern "C" { // The following are available require VS2012. Port to GCC. #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) // TODO(fbarchard): fix AVX2 versions of YUV conversion. 
bug=393 -#define HAS_I422TOARGBROW_AVX2 -#define HAS_I422TOABGRROW_AVX2 -#define HAS_I422TOBGRAROW_AVX2 -#define HAS_I422TORGBAROW_AVX2 -#define HAS_NV12TOARGBROW_AVX2 -#define HAS_NV21TOARGBROW_AVX2 -#define HAS_ARGBTORGB565ROW_AVX2 #define HAS_ARGBTOARGB1555ROW_AVX2 #define HAS_ARGBTOARGB4444ROW_AVX2 -#define HAS_NV12TORGB565ROW_AVX2 -#define HAS_NV21TORGB565ROW_AVX2 -#define HAS_I422TORGB565ROW_AVX2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_I411TOARGBROW_AVX2 +#define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 -#define HAS_I422TORGB24ROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_I422TOBGRAROW_AVX2 #define HAS_I422TORAWROW_AVX2 +#define HAS_I422TORGB24ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I422TORGBAROW_AVX2 #define HAS_I444TOARGBROW_AVX2 -#define HAS_I411TOARGBROW_AVX2 #define HAS_J400TOARGBROW_AVX2 #define HAS_J422TOARGBROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB565ROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 // TODO(fbarchard): Port to Neon #define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565DITHERROW_AVX2 @@ -877,6 +881,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix); +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix); void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); @@ -894,6 +899,9 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix); + void ARGB1555ToARGBRow_Any_SSE2(const 
uint8* src_argb1555, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index cdc740472..c4717b8d4 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1356 +#define LIBYUV_VERSION 1357 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index 41696c18f..3622d8509 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1064,6 +1064,14 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 21969b9b8..40abdf96e 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -576,6 +576,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_RGB565TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index b57a01a8e..3fc1ade6a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -222,6 +222,10 @@ RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C, RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, 2, 4, 7) #endif +#if defined(HAS_RGB565TOARGBROW_AVX2) 
+RGBANY(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, RGB565ToARGBRow_C,
+       2, 4, 15)
+#endif
 #if defined(HAS_YUY2TOARGBROW_AVX2)
 RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31)
 RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31)
diff --git a/source/row_win.cc b/source/row_win.cc
index 85c4dd20f..e34bdf70c 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -516,6 +516,63 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
   }
 }
 
+#ifdef HAS_RGB565TOARGBROW_AVX2
+// Converts RGB565 (2 bytes/pixel) to ARGB (4 bytes/pixel), 16 pixels per pass.
+// The loop always runs once and repeats while the remaining count is > 0, so
+// pix should be a positive multiple of 16.
+// Each 5/6 bit field is widened to 8 bits by replicating its top bits with one
+// vpmulhuw (high 16 bits of the product): (v << 8) | (v << 3) == v * (256 + 8).
+// G's multiplier is (256 + 4) << 5: the extra shift of 5 aligns the G field.
+__declspec(naked) __declspec(align(16))
+void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+  __asm {
+    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
+    vmovd      xmm5, eax
+    vbroadcastss ymm5, xmm5
+    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    vmovd      xmm6, eax        // VEX encoding; legacy movd would mix SSE/AVX
+    vbroadcastss ymm6, xmm6
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
+    vpsllw     ymm3, ymm3, 11
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
+    vpsllw     ymm4, ymm4, 10
+    vpsrlw     ymm4, ymm4, 5
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
+    vpsllw     ymm7, ymm7, 8
+
+    mov        eax, [esp + 4]   // src_rgb565
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // pix
+    sub        edx, eax         // edx = dst - 2 * src, so [eax * 2 + edx]
+    sub        edx, eax         // addresses dst as eax walks the source
+
+ convertloop:
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
+    vpsllw     ymm1, ymm1, 8  // move R to the high byte of each word
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpckhbw ymm2, ymm1, ymm0  // interleave RB and AG bytes: B, G, R, A
+    vpunpcklbw ymm1, ymm1, ymm0
+    vmovdqu    [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
+    vmovdqu    [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
+    lea        eax, [eax + 32]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper  // must precede ret; after ret it would never execute
+    ret
+  }
+}
+#endif  // HAS_RGB565TOARGBROW_AVX2
+
 // 24 instructions
 __declspec(naked) __declspec(align(16))
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
@@ -2856,7 +2913,7 @@ void I400ToARGBRow_AVX2(const uint8* y_buf,
     mov        ecx, [esp + 12]   // width
 
  convertloop:
-    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
     vmovdqu    xmm0, [eax]
     lea        eax, [eax + 16]
     vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates