From ed96b7b2c7169ba264209ac9fe3a1a49ebcac814 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 26 Jan 2018 16:05:01 -0800 Subject: [PATCH] AVX2 port of H010ToAR30_AVX2 Was SSSE3 H010ToAR30_Opt (635 ms) Now AVX2 H010ToAR30_Opt (448 ms) Bug: libyuv:751 Test: LibYUVConvertTest.H010ToAR30_Opt Change-Id: I17b1a0e3268c4a9836e09683dd3377fb1ce60932 Reviewed-on: https://chromium-review.googlesource.com/889906 Commit-Queue: Frank Barchard Reviewed-by: Miguel Casas --- README.chromium | 2 +- include/libyuv/row.h | 13 +++++++++++++ include/libyuv/version.h | 2 +- source/row_any.cc | 3 +++ source/row_gcc.cc | 40 +++++++++++++++++++++++++++++++++++++++ unit_test/convert_test.cc | 1 + 6 files changed, 59 insertions(+), 2 deletions(-) diff --git a/README.chromium b/README.chromium index 6294ca9dd..7870f0890 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1693 +Version: 1694 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9c8908e59..c70928e1e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -272,6 +272,7 @@ extern "C" { #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 +#define HAS_I210TOAR30ROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2 @@ -1918,6 +1919,12 @@ void I210ToARGBRow_AVX2(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I210ToAR30Row_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2092,6 +2099,12 @@ void I210ToARGBRow_Any_AVX2(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I210ToAR30Row_Any_AVX2(const uint16_t* 
src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 6b40f653a..4e261c561 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1693 +#define LIBYUV_VERSION 1694 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_any.cc b/source/row_any.cc index f6bdfaa0f..3f7c4a7b7 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -230,6 +230,9 @@ ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) #ifdef HAS_I210TOARGBROW_AVX2 ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif #undef ANY31CT // Any 2 planes to 1. diff --git a/source/row_gcc.cc b/source/row_gcc.cc index d817556f5..e430ec77a 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -2501,6 +2501,46 @@ void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, } #endif // HAS_I210TOARGBROW_AVX2 +#if defined(HAS_I210TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_ar30,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf
+    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
+    "vpsrlw $14,%%ymm5,%%ymm5 \n"  // every 16-bit lane = 0x0003
+    "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
+    "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
+    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
+    "vpsrlw $6,%%ymm7,%%ymm7 \n"  // every 16-bit lane = 0x03ff
+
+    LABELALIGN
+    "1: \n"
+    READYUV210_AVX2
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+    "sub $0x10,%[width] \n"  // width -= 16 on every pass
+    "jg 1b \n"  // repeat while width > 0
+
+    "vzeroupper \n"
+  : [y_buf]"+r"(y_buf), // %[y_buf]
+    [u_buf]"+r"(u_buf), // %[u_buf]
+    [v_buf]"+r"(v_buf), // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
+    [width]"+rm"(width) // %[width]
+  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2  // xmm6/xmm7: ymm6/ymm7 are written
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif // HAS_I210TOAR30ROW_AVX2
+
 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index d33511b0f..c3d8d229d 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -2083,6 +2083,7 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
 
 TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2)
 TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2)
+TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2)
 TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2)
 TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2)
 TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2)