From 664c735677521d3ec216315ed599304f660f0f48 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 31 Jan 2018 15:40:08 -0800 Subject: [PATCH] I420ToYUY2_AVX2 port I420 and I422 To YUY2 and UYVY ported from SSE2 to AVX2. Was SSE2 I420ToYUY2_Opt (135 ms) I420ToUYVY_Opt (148 ms) I422ToYUY2_Opt (145 ms) I422ToUYVY_Opt (142 ms) Now AVX2 I420ToYUY2_Opt (133 ms) I420ToUYVY_Opt (130 ms) I422ToYUY2_Opt (127 ms) I422ToUYVY_Opt (137 ms) Bug: libyuv:556 Test: out/Release/libyuv_unittest --sandbox_unittests --gtest_filter=*I42?To*UY*Opt Change-Id: Ic35f97cee02dc009fd98785589ba17c7cf50bb35 Reviewed-on: https://chromium-review.googlesource.com/892493 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- README.chromium | 2 +- include/libyuv/row.h | 38 +++++++++++++++-- include/libyuv/version.h | 2 +- source/convert_from.cc | 32 +++++++++++++++ source/convert_from_argb.cc | 16 ++++++++ source/row_any.cc | 4 ++ source/row_gcc.cc | 82 +++++++++++++++++++++++++++++++++++++ source/scale_gcc.cc | 1 + 8 files changed, 171 insertions(+), 6 deletions(-) diff --git a/README.chromium b/README.chromium index 7870f0890..8370f8d30 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1694 +Version: 1695 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index caa45fb31..506c3e06f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -276,6 +276,8 @@ extern "C" { #define HAS_I210TOARGBROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 +#define HAS_I422TOUYVYROW_AVX2 +#define HAS_I422TOYUY2ROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2 #endif @@ -2412,8 +2414,12 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_argb, void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); -void 
ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_abgr, + uint8_t* dst_ar30, + int width); +void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, + uint8_t* dst_ar30, + int width); void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, @@ -2433,8 +2439,12 @@ void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_argb, void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ABGRToAR30Row_Any_AVX2(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); -void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void ABGRToAR30Row_Any_AVX2(const uint8_t* src_abgr, + uint8_t* dst_ar30, + int width); +void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, + uint8_t* dst_ar30, + int width); void ARGBToRGB24Row_Any_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, @@ -2840,6 +2850,26 @@ void I422ToUYVYRow_Any_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width); +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_Any_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 4e261c561..ba3847c70 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1694 +#define LIBYUV_VERSION 1695 #endif // 
INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_from.cc b/source/convert_from.cc index bc90d3c57..b5587ced6 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -219,6 +219,14 @@ int I422ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -270,6 +278,14 @@ int I420ToYUY2(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -341,6 +357,14 @@ int I422ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -400,6 +424,14 @@ int I420ToUYVY(const uint8_t* src_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 839bc333e..d0b4829f5 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -583,6 +583,14 @@ int ARGBToYUY2(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + 
I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -712,6 +720,14 @@ int ARGBToUYVY(const uint8_t* src_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 31e5ea37d..9343992b1 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -96,6 +96,10 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOYUY2ROW_NEON ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 4451fa2ab..85ef1319c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -6041,6 +6041,88 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, } #endif // HAS_I422TOUYVYROW_SSE2 +#ifdef HAS_I422TOYUY2ROW_AVX2 +// TODO(fbarchard): Consider vmovhps to avoid vpermq /* I422ToYUY2Row_AVX2 and I422ToUYVYRow_AVX2 share one loop shape. src_v is first turned into an offset from src_u (sub %1,%2) so a single advancing pointer addresses both chroma planes. Each pass loads 16 bytes through %1 and 16 bytes through %1+%2, loads 32 bytes from src_y, byte-interleaves the two chroma vectors, then interleaves that result with the Y vector (Y byte first in the YUY2 routine, chroma byte first in the UYVY routine) and stores 64 bytes to dst_frame. width is reduced by 0x20 per pass and the loop repeats while the result is positive. NOTE(review): vpunpcklbw and vpunpckhbw interleave within each 128-bit lane, which appears to be why every operand is pre-shuffled with vpermq $0xd8 (the cost the TODO refers to) - confirm against the ISA reference. The dispatch code in this patch selects these routines directly only when IS_ALIGNED(width, 32); other widths are routed to the _Any_AVX2 wrappers. */ + +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%1),%%xmm2 \n" + "vmovdqu 0x00(%1,%2,1),%%xmm3 \n" + "lea 0x10(%1),%1 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpermq
$0xd8,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3) \n" + "vmovdqu %%ymm1,0x20(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} +#endif // HAS_I422TOYUY2ROW_AVX2 + +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%1),%%xmm2 \n" + "vmovdqu 0x00(%1,%2,1),%%xmm3 \n" + "lea 0x10(%1),%1 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%3) \n" + "vmovdqu %%ymm2,0x20(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); +} +#endif // HAS_I422TOUYVYROW_AVX2 + #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index b2b415940..312236d2d 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -1324,6 +1324,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, "movd %%xmm0,(%0) \n" LABELALIGN "99: \n" // clang-format error. + : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+rm"(dst_width), // %2