From 446fa955877084b353cd7d7c1c8471013a683e91 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 24 Feb 2015 23:14:46 +0000 Subject: [PATCH] I422ToRGB565, ARGB4444 and ARGB1555 for AVX2 BUG=403 TESTED=avx2 emulator Review URL: https://webrtc-codereview.appspot.com/34359004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1293 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 52 +++++++++++++- include/libyuv/version.h | 2 +- source/convert_from.cc | 24 +++++++ source/planar_functions.cc | 16 +++++ source/row_any.cc | 14 ++++ source/row_common.cc | 135 +++++++++++++++++++++++++++++++++---- 7 files changed, 228 insertions(+), 17 deletions(-) diff --git a/README.chromium b/README.chromium index b9ad2682b..304b33c1c 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1292 +Version: 1293 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8de624c95..50ecb6dfe 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -84,7 +84,6 @@ extern "C" { #define HAS_I411TOARGBROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 -#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOBGRAROW_SSSE3 @@ -220,6 +219,11 @@ extern "C" { #if defined(HAS_I422TOARGBROW_AVX2) #define HAS_YUY2TOARGBROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TORGB565ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I422TOARGB1555ROW_AVX2 +#define HAS_I422TOARGB4444ROW_AVX2 #endif // Effects: @@ -1047,6 +1051,14 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_argb, int width); +void NV12ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* 
dst_argb, + int width); void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width); @@ -1084,16 +1096,31 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, int width); +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, int width); +void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToRGB24Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1155,6 +1182,14 @@ void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_argb, int width); +void NV12ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, + int width); +void NV21ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_vu, + uint8* dst_argb, + int width); void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width); @@ -1187,16 +1222,31 @@ void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_rgba, int width); +void I422ToARGB4444Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, int width); +void I422ToARGB1555Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, int width); +void 
I422ToRGB565Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, + int width); void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index f3c0a5568..0881d1097 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1292 +#define LIBYUV_VERSION 1293 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_from.cc b/source/convert_from.cc index cfe37bf97..b743cde26 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -843,6 +843,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB1555Row = I422ToARGB1555Row_AVX2; + } + } +#endif #if defined(HAS_I422TOARGB1555ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; @@ -896,6 +904,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGB4444Row = I422ToARGB4444Row_AVX2; + } + } +#endif #if defined(HAS_I422TOARGB4444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; @@ -948,6 +964,14 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif #if defined(HAS_I422TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB565Row = I422ToRGB565Row_Any_NEON; diff --git 
a/source/planar_functions.cc b/source/planar_functions.cc index 21ece1a6f..75ef775dd 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1020,6 +1020,14 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB565Row = NV12ToRGB565Row_AVX2; + } + } +#endif #if defined(HAS_NV12TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; @@ -1069,6 +1077,14 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV21TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB565Row = NV21ToRGB565Row_AVX2; + } + } +#endif #if defined(HAS_NV21TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 6c1a5b307..0ae2fdf04 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -73,6 +73,14 @@ YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15) #ifdef HAS_I422TOABGRROW_AVX2 YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15) #endif // HAS_I422TOABGRROW_AVX2 +#ifdef HAS_I422TOARGB4444ROW_AVX2 +YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C, + 1, 2, 7) +YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C, + 1, 2, 7) +YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C, + 1, 2, 7) +#endif #ifdef HAS_I422TOARGBROW_NEON YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7) YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7) @@ -123,6 +131,12 @@ NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C, NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 
NV21ToRGB565Row_C, 0, 2, 7) #endif // HAS_NV12TORGB565ROW_SSSE3 +#ifdef HAS_NV12TORGB565ROW_AVX2 +NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C, + 0, 2, 15) +NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C, + 0, 2, 15) +#endif // HAS_NV12TORGB565ROW_AVX2 #ifdef HAS_NV12TORGB565ROW_NEON NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2, 7) diff --git a/source/row_common.cc b/source/row_common.cc index 9cb11027d..e56c057b6 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2125,9 +2125,8 @@ void I422ToUYVYRow_C(const uint8* src_y, // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 -#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) +#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. -#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -2145,9 +2144,9 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, width -= twidth; } } -#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) +#endif -#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +#if defined(HAS_I422TOARGB1555ROW_SSSE3) void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -2166,7 +2165,9 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, width -= twidth; } } +#endif +#if defined(HAS_I422TOARGB4444ROW_SSSE3) void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -2185,7 +2186,9 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, width -= twidth; } } +#endif +#if defined(HAS_NV12TORGB565ROW_SSSE3) void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, int width) { // Row buffer for intermediate ARGB pixels. 
@@ -2200,7 +2203,9 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, width -= twidth; } } +#endif +#if defined(HAS_NV21TORGB565ROW_SSSE3) void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_rgb565, int width) { // Row buffer for intermediate ARGB pixels. @@ -2215,7 +2220,9 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, width -= twidth; } } +#endif +#if defined(HAS_YUY2TOARGBROW_SSSE3) void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { // Row buffers for intermediate YUV pixels. SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); @@ -2231,7 +2238,9 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { width -= twidth; } } +#endif +#if defined(HAS_UYVYTOARGBROW_SSSE3) void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { // Row buffers for intermediate YUV pixels. SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); @@ -2247,15 +2256,111 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { width -= twidth; } } -#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(LIBYUV_DISABLE_X86) +#if defined(HAS_I422TORGB565ROW_AVX2) && !defined(_MSC_VER) +// row_win.cc has asm version, but GCC uses 2 step wrapper. +void I422ToRGB565Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB1555ROW_AVX2) +void I422ToARGB1555Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb1555, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb1555 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TOARGB4444ROW_AVX2) +void I422ToARGB4444Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb4444, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_argb4444 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB565ROW_AVX2) +void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB565ROW_AVX2) +void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth); + ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } +} +#endif + #if defined(HAS_YUY2TOARGBROW_AVX2) void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) { - // Row buffers for intermediate YUV conversion. - SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth); @@ -2266,12 +2371,14 @@ void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) { width -= twidth; } } +#endif +#if defined(HAS_UYVYTOARGBROW_AVX2) void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) { - // Row buffers for intermediate YUV conversion. - SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]); + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth); @@ -2282,7 +2389,7 @@ void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) { width -= twidth; } } -#endif +#endif // defined(HAS_UYVYTOARGBROW_AVX2) void ARGBPolynomialRow_C(const uint8* src_argb, uint8* dst_argb, const float* poly,