From 28427a53e2596608d6c8d63570e4d302e41fa313 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 18 Sep 2015 11:20:58 -0700 Subject: [PATCH] I444ToABGR for Android Reimplements I444ToARGB as a matrix function. Adds I444ToABGR as matrix functions, with wrappers and Any (unaligned width) functions. Allows for future J444 and H444 versions. Adds the user-level I444ToABGR function. BUG=libyuv:490, libyuv:449 R=harryjin@google.com Review URL: https://codereview.chromium.org/1355733002 . --- README.chromium | 2 +- include/libyuv/convert_argb.h | 8 ++ include/libyuv/row.h | 103 +++++++++++++++++++++----- include/libyuv/version.h | 2 +- source/convert_argb.cc | 68 +++++++++++++++++ source/row_any.cc | 6 ++ source/row_common.cc | 54 ++++++++++++++ source/row_win.cc | 135 +++++++++++++++++++++++++++------- unit_test/convert_test.cc | 1 + 9 files changed, 331 insertions(+), 48 deletions(-) diff --git a/README.chromium b/README.chromium index d4abd41e8..ad1b91ef0 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1485 +Version: 1486 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index 5463877e9..a161d3343 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -60,6 +60,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height); + // Convert I411 to ARGB.
LIBYUV_API int I411ToARGB(const uint8* src_y, int src_stride_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index b7e9d5c9b..ca3c3d01b 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -56,6 +56,26 @@ extern "C" { #endif // clang >= 3.5 #endif // __clang__ +// GCC >= 4.7.0 required for AVX2. +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) @@ -163,6 +183,7 @@ extern "C" { #endif // The following are available on x64 Visual C and clangcl. +// TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) #define HAS_I422TOARGBROW_SSSE3 @@ -171,27 +192,17 @@ extern "C" { #define HAS_I422TOABGRMATRIXROW_SSSE3 #endif -// GCC >= 4.7.0 required for AVX2. -#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) -#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) -#define GCC_HAS_AVX2 1 -#endif // GNUC >= 4.7 -#endif // __GNUC__ - -// clang >= 3.4.0 required for AVX2. 
-#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) -#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) -#define CLANG_HAS_AVX2 1 -#endif // clang >= 3.4 -#endif // __clang__ - -// Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 -#define VISUALC_HAS_AVX2 1 -#endif // VisualStudio >= 2012 - // The following are available for Visual C and clangcl 32 bit: +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) +#define HAS_I444TOABGRROW_SSSE3 +#define HAS_I444TOARGBMATRIXROW_SSSE3 +#define HAS_I444TOABGRMATRIXROW_SSSE3 +#endif + +// The following are available for AVX2 Visual C and clangcl 32 bit: +// TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_ARGB1555TOARGBROW_AVX2 @@ -206,12 +217,15 @@ extern "C" { #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TORGB565ROW_AVX2 #define HAS_I444TOARGBROW_AVX2 +#define HAS_I444TOABGRROW_AVX2 #define HAS_J400TOARGBROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB565ROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2 +#define HAS_I444TOARGBMATRIXROW_AVX2 +#define HAS_I444TOABGRMATRIXROW_AVX2 #endif // The following are available on all x86 platforms, but @@ -1030,6 +1044,11 @@ void I444ToARGBRow_C(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I444ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); void I422ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1166,6 +1185,18 @@ void I422ToABGRRow_AVX2(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I444ToARGBMatrixRow_SSSE3(const uint8* src_y, + const uint8* src_u,
+ const uint8* src_v, + uint8* dst_argb, + struct YuvConstants* YuvConstants, + int width); +void I444ToARGBMatrixRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + struct YuvConstants* YuvConstants, + int width); void I444ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1176,6 +1207,28 @@ void I444ToARGBRow_AVX2(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I444ToABGRMatrixRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + struct YuvConstants* YuvConstants, + int width); +void I444ToABGRMatrixRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + struct YuvConstants* YuvConstants, + int width); +void I444ToABGRRow_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I444ToABGRRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); void I422ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1382,6 +1435,16 @@ void I444ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I444ToABGRRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); +void I444ToABGRRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, + int width); void I422ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 4b10a1382..d9ce19bb0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1485 +#define LIBYUV_VERSION 1486 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc 
index d282f598a..dec44dedd 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -112,6 +112,74 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert I444 to ABGR. Returns 0 on success, -1 on invalid arguments. +LIBYUV_API +int I444ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + int y; + void (*I444ToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I444ToABGRRow_C; // Default: C row function. + if (!src_y || !src_u || !src_v || + !dst_abgr || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image: rows are written bottom-up. + if (height < 0) { + height = -height; + dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; + dst_stride_abgr = -dst_stride_abgr; + } + // Coalesce rows: contiguous planes are converted as one long row. + if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; + } +#if defined(HAS_I444TOABGRROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I444ToABGRRow = I444ToABGRRow_Any_SSSE3; // Kept unless width is a multiple of 8. + if (IS_ALIGNED(width, 8)) { + I444ToABGRRow = I444ToABGRRow_SSSE3; + } + } +#endif +#if defined(HAS_I444TOABGRROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToABGRRow = I444ToABGRRow_Any_AVX2; // Kept unless width is a multiple of 16. + if (IS_ALIGNED(width, 16)) { + I444ToABGRRow = I444ToABGRRow_AVX2; + } + } +#endif +#if defined(HAS_I444TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I444ToABGRRow = I444ToABGRRow_Any_NEON; // Kept unless width is a multiple of 8. + if (IS_ALIGNED(width, 8)) { + I444ToABGRRow = I444ToABGRRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + I444ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + dst_abgr += dst_stride_abgr; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert I422 to ARGB.
LIBYUV_API int I422ToARGB(const uint8* src_y, int src_stride_y, diff --git a/source/row_any.cc b/source/row_any.cc index 0e75547bc..c309499ca 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -62,6 +62,9 @@ ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7) ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #endif // HAS_I444TOARGBROW_SSSE3 +#ifdef HAS_I444TOABGRROW_SSSE3 +ANY31(I444ToABGRRow_Any_SSSE3, I444ToABGRRow_SSSE3, 0, 0, 4, 7) +#endif #ifdef HAS_I422TORGB24ROW_AVX2 ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) #endif @@ -95,6 +98,9 @@ ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif +#ifdef HAS_I444TOABGRROW_AVX2 +ANY31(I444ToABGRRow_Any_AVX2, I444ToABGRRow_AVX2, 0, 0, 4, 15) +#endif #ifdef HAS_I411TOARGBROW_AVX2 ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) #endif diff --git a/source/row_common.cc b/source/row_common.cc index 5f4aa3714..13195684c 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1149,6 +1149,30 @@ void I444ToARGBRow_C(const uint8* src_y, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); } } + +void I444ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { // 2 pixels share one averaged U, V. + uint8 u = (src_u[0] + src_u[1] + 1) >> 1; // Rounded average of the pair. + uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + YuvPixel(src_y[0], u, v, rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; // Opaque alpha. + YuvPixel(src_y[1], u, v, rgb_buf + 6, rgb_buf + 5, rgb_buf + 4); + rgb_buf[7] = 255; + src_y += 2; + src_u += 2; + src_v += 2; + rgb_buf += 8; // Advance 2 pixels.
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 2, rgb_buf + 1, rgb_buf); + rgb_buf[3] = 255; // Opaque alpha for the odd last pixel. + } +} #else void I444ToARGBRow_C(const uint8* src_y, const uint8* src_u, @@ -1166,6 +1190,23 @@ void I444ToARGBRow_C(const uint8* src_y, rgb_buf += 4; // Advance 1 pixel. } } + +void I444ToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel(src_y[0], src_u[0], src_v[0], + rgb_buf + 2, rgb_buf + 1, rgb_buf + 0); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} #endif // Also used for 420 @@ -2319,6 +2360,19 @@ ANYYUV(I422ToABGRRow_AVX2, I422ToABGRMatrixRow_AVX2, kYuvConstants) ANYYUV(J422ToABGRRow_AVX2, I422ToABGRMatrixRow_AVX2, kYuvJConstants) ANYYUV(H422ToABGRRow_AVX2, I422ToABGRMatrixRow_AVX2, kYuvHConstants) #endif +// TODO(fbarchard): Neon, J444, H444 versions. +#ifdef HAS_I444TOARGBMATRIXROW_SSSE3 +ANYYUV(I444ToARGBRow_SSSE3, I444ToARGBMatrixRow_SSSE3, kYuvConstants) +#endif +#ifdef HAS_I444TOARGBMATRIXROW_AVX2 +ANYYUV(I444ToARGBRow_AVX2, I444ToARGBMatrixRow_AVX2, kYuvConstants) +#endif +#ifdef HAS_I444TOABGRMATRIXROW_SSSE3 +ANYYUV(I444ToABGRRow_SSSE3, I444ToABGRMatrixRow_SSSE3, kYuvConstants) +#endif +#ifdef HAS_I444TOABGRMATRIXROW_AVX2 +ANYYUV(I444ToABGRRow_AVX2, I444ToABGRMatrixRow_AVX2, kYuvConstants) +#endif // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 diff --git a/source/row_win.cc b/source/row_win.cc index 91aea8e94..62beb1c9b 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2172,41 +2172,83 @@ void I422ToARGBMatrixRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBMATRIXROW_AVX2 -#ifdef HAS_I444TOARGBROW_AVX2 +#ifdef HAS_I444TOARGBMATRIXROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) -void I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { +void I444ToARGBMatrixRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + struct YuvConstants* YuvConstants, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebp // ebp will hold the YuvConstants pointer + mov eax, [esp + 12 + 4] // Y (12 = esi, edi, ebp pushed above) + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebp, [esp + 12 + 20] // YuvConstants + mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - convertloop: READYUV444_AVX2 - YUVTORGB_AVX2(kYuvConstants) + YUVTORGB_AVX2(ebp) STOREARGB_AVX2 sub ecx, 16 jg convertloop + pop ebp pop edi pop esi vzeroupper ret } } -#endif // HAS_I444TOARGBROW_AVX2 +#endif // HAS_I444TOARGBMATRIXROW_AVX2 + +#ifdef HAS_I444TOABGRMATRIXROW_AVX2 +// 16 pixels per loop; conversion constants are passed in YuvConstants. +// 16 UV values with 16 Y producing 16 ABGR (64 bytes). +__declspec(naked) +void I444ToABGRMatrixRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + struct YuvConstants* YuvConstants, + int width) { + __asm { + push esi + push edi + push ebp // ebp will hold the YuvConstants pointer + mov eax, [esp + 12 + 4] // Y (12 = esi, edi, ebp pushed above) + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // abgr + mov ebp, [esp + 12 + 20] // YuvConstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi // edi = V - U + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + convertloop: + READYUV444_AVX2 + YUVTORGB_AVX2(ebp) + STOREABGR_AVX2 + + sub ecx, 16 // 16 pixels per loop + jg convertloop + + pop ebp + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I444TOABGRMATRIXROW_AVX2 #ifdef HAS_I411TOARGBROW_AVX2 // 16 pixels @@ -2608,30 +2650,71 @@ void I422ToABGRMatrixRow_AVX2(const uint8* y_buf, // 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) -void I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - int width) { +void I444ToARGBMatrixRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + struct YuvConstants* YuvConstants, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + push ebp // ebp will hold the YuvConstants pointer + mov eax, [esp + 12 + 4] // Y (12 = esi, edi, ebp pushed above) + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ebp, [esp + 12 + 20] // YuvConstants + mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV444 - YUVTORGB(kYuvConstants) + YUVTORGB(ebp) STOREARGB sub ecx, 8 jg convertloop + pop ebp pop edi pop esi ret + } +} + +// 8 pixels per loop; conversion constants are passed in YuvConstants. +// 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes).
+__declspec(naked) +void I444ToABGRMatrixRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_abgr, + struct YuvConstants* YuvConstants, + int width) { + __asm { + push esi + push edi + push ebp // ebp will hold the YuvConstants pointer + mov eax, [esp + 12 + 4] // Y (12 = esi, edi, ebp pushed above) + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // abgr + mov ebp, [esp + 12 + 20] // YuvConstants + mov ecx, [esp + 12 + 24] // width + sub edi, esi // edi = V - U + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV444 + YUVTORGB(ebp) + STOREABGR + + sub ecx, 8 // 8 pixels per loop + jg convertloop + + pop ebp pop edi pop esi ret diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index c91881f69..662c17224 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -513,6 +513,7 @@ TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4) TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4) TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4)