From b86dbf24d30308aafca025018f13fc106c36e55b Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 27 Oct 2015 14:17:21 -0700 Subject: [PATCH] refactor I420AlphaToABGR to use I420AlphaToARGB internally swap U and V and transpose conversion matrix, so I420AlphaToARGB and I420AlphaToABGR share low level code. Having less code with same performance allows more focused optimization for future ARM versions. R=harryjin@google.com TBR=harryjin@chromium.org BUG=libyuv:473,libyuv:516 Review URL: https://codereview.chromium.org/1422263002 . --- README.chromium | 2 +- include/libyuv/row.h | 38 --------- include/libyuv/version.h | 2 +- source/convert_argb.cc | 164 +++++++++++---------------------------- source/row_any.cc | 2 - source/row_common.cc | 28 ------- source/row_gcc.cc | 69 ---------------- source/row_win.cc | 102 ------------------------ 8 files changed, 49 insertions(+), 358 deletions(-) diff --git a/README.chromium b/README.chromium index 0236d1280..6c38ea296 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1525 +Version: 1526 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9ac2678bb..b0483c1c1 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -107,7 +107,6 @@ extern "C" { #define HAS_H422TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 -#define HAS_I422ALPHATOABGRROW_SSSE3 #define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 @@ -199,7 +198,6 @@ extern "C" { #define HAS_H422TOABGRROW_AVX2 #define HAS_H422TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 -#define HAS_I422ALPHATOABGRROW_AVX2 #define HAS_I422ALPHATOARGBROW_AVX2 #define HAS_I422TOABGRROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -254,7 +252,6 @@ extern "C" { // The following are also available on x64 Visual C. #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) -#define HAS_I422ALPHATOABGRROW_SSSE3 #define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 @@ -1053,13 +1050,6 @@ void I422AlphaToARGBRow_C(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToABGRRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422ToABGRRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1216,13 +1206,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422AlphaToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1230,13 +1213,6 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToABGRRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); void I422ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1434,13 +1410,6 @@ void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToABGRRow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width); void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1448,13 +1417,6 @@ void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToABGRRow_Any_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width); void I411ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 342f05fdd..270bba627 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1525 +#define LIBYUV_VERSION 1526 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index ca3509223..af18e66ff 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -45,7 +45,6 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, } // Convert I444 to ARGB. -LIBYUV_API static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -129,6 +128,21 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, width, height); } +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_abgr, int dst_stride_abgr, + int width, int height) { + return I444ToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + dst_abgr, dst_stride_abgr, + &kYvuIConstants, // Use Yvu matrix + width, height); +} + // Convert J444 to ARGB. LIBYUV_API int J444ToARGB(const uint8* src_y, int src_stride_y, @@ -144,21 +158,6 @@ int J444ToARGB(const uint8* src_y, int src_stride_y, width, height); } -// Convert I444 to ABGR. -LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, - &kYvuIConstants, - width, height); -} - // Convert I422 to ARGB. LIBYUV_API int I422ToARGB(const uint8* src_y, int src_stride_y, @@ -307,13 +306,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, } // Convert I420 with Alpha to preattenuated ARGB. -LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate) { +static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + const uint8* src_a, int src_stride_a, + uint8* dst_argb, int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, int height, int attenuate) { int y; void (*I422AlphaToARGBRow)(const uint8* y_buf, const uint8* u_buf, @@ -393,7 +392,7 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvIConstants, + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); @@ -409,6 +408,23 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert I420 with Alpha to preattenuated ARGB. +LIBYUV_API +int I420AlphaToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + const uint8* src_a, int src_stride_a, + uint8* dst_argb, int dst_stride_argb, + int width, int height, int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_a, src_stride_a, + dst_argb, dst_stride_argb, + &kYuvIConstants, + width, height, attenuate); +} + // Convert I420 with Alpha to preattenuated ARGB. LIBYUV_API int I420AlphaToABGR(const uint8* src_y, int src_stride_y, @@ -417,99 +433,13 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y, const uint8* src_a, int src_stride_a, uint8* dst_abgr, int dst_stride_abgr, int width, int height, int attenuate) { - int y; - void (*I422AlphaToABGRRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width) = I422AlphaToABGRRow_C; - void (*ARGBAttenuateRow)(const uint8* src_abgr, uint8* dst_abgr, - int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_abgr || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; - dst_stride_abgr = -dst_stride_abgr; - } -#if defined(HAS_I422ALPHATOABGRROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422AlphaToABGRRow = I422AlphaToABGRRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422AlphaToABGRRow = I422AlphaToABGRRow_SSSE3; - } - } -#endif -#if defined(HAS_I422ALPHATOABGRROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - I422AlphaToABGRRow = I422AlphaToABGRRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - I422AlphaToABGRRow = I422AlphaToABGRRow_AVX2; - } - } -#endif -#if defined(HAS_I422ALPHATOABGRROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422AlphaToABGRRow = I422AlphaToABGRRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422AlphaToABGRRow = I422AlphaToABGRRow_NEON; - } - } -#endif -#if defined(HAS_I422ALPHATOABGRROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_abgr, 4) && IS_ALIGNED(dst_stride_abgr, 4)) { - I422AlphaToABGRRow = I422AlphaToABGRRow_MIPS_DSPR2; - } -#endif -#if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBATTENUATEROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBAttenuateRow = ARGBAttenuateRow_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvIConstants, - width); - if (attenuate) { - ARGBAttenuateRow(dst_abgr, dst_abgr, width); - } - dst_abgr += dst_stride_abgr; - src_a += src_stride_a; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - return 0; + return I420AlphaToARGBMatrix(src_y, src_stride_y, + src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, + src_a, src_stride_a, + dst_abgr, dst_stride_abgr, + &kYvuIConstants, // Use Yvu matrix + width, height, attenuate); } // Convert I400 to ARGB. diff --git a/source/row_any.cc b/source/row_any.cc index 764964cd1..859381fb7 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -46,11 +46,9 @@ extern "C" { #ifdef HAS_I422ALPHATOARGBROW_SSSE3 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) -ANY41C(I422AlphaToABGRRow_Any_SSSE3, I422AlphaToABGRRow_SSSE3, 1, 0, 4, 7) #endif #ifdef HAS_I422ALPHATOARGBROW_AVX2 ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 7) -ANY41C(I422AlphaToABGRRow_Any_AVX2, I422AlphaToABGRRow_AVX2, 1, 0, 4, 7) #endif #undef ANY41C diff --git a/source/row_common.cc b/source/row_common.cc index 66c0fccb0..87862bff5 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1413,34 +1413,6 @@ void I422ToABGRRow_C(const uint8* src_y, } } -void I422AlphaToABGRRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - int x; - for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants); - rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 6, rgb_buf + 5, rgb_buf + 4, yuvconstants); - rgb_buf[7] = src_a[1]; - src_y += 2; - src_u += 1; - src_v += 1; - src_a += 2; - rgb_buf += 8; // Advance 2 pixels. - } - if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 2, rgb_buf + 1, rgb_buf + 0, yuvconstants); - rgb_buf[3] = src_a[0]; - } -} - void I422ToRGB24Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index d6976dd98..9c2123044 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1766,38 +1766,6 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, ); } -void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422 - YUVTORGB(yuvconstants) - STOREABGR - "subl $0x8,%[width] \n" - "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} - void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2229,43 +2197,6 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422ALPHATOARGBROW_AVX2 -#if defined(HAS_I422ALPHATOABGRROW_AVX2) -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. -void OMITFP I422AlphaToABGRRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - "sub %[u_buf],%[v_buf] \n" - LABELALIGN - "1: \n" - READYUVA422_AVX2 - YUVTORGB_AVX2(yuvconstants) - STOREABGR_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif - : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_I422ALPHATOABGRROW_AVX2 - #if defined(HAS_I422TOABGRROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). diff --git a/source/row_win.cc b/source/row_win.cc index 4e2fee1b0..499f75daf 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -153,25 +153,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, } #endif -#if defined(HAS_I422ALPHATOABGRROW_SSSE3) -void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width) { - __m128i xmm0, xmm1, xmm2, xmm4, xmm5; - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; - while (width > 0) { - READYUVA422 - YUVTORGB(yuvconstants) - STOREABGR - width -= 8; - } -} -#endif - // 32 bit #else // defined(_M_X64) #ifdef HAS_ARGBTOYROW_SSSE3 @@ -2185,49 +2166,6 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422ALPHATOARGBROW_AVX2 -#ifdef HAS_I422ALPHATOABGRROW_AVX2 -// 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR. -__declspec(naked) -void I422AlphaToABGRRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // abgr - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422_AVX2 - YUVTORGB_AVX2(ebx) - STOREABGR_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I422ALPHATOABGRROW_AVX2 - #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). @@ -3027,46 +2965,6 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels. -// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR. -__declspec(naked) -void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_abgr, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov ebp, [esp + 16 + 16] // A - mov edx, [esp + 16 + 20] // abgr - mov ebx, [esp + 16 + 24] // yuvconstants - mov ecx, [esp + 16 + 28] // width - sub edi, esi - - convertloop: - READYUVA422 - YUVTORGB(ebx) - STOREABGR - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - // 8 pixels. // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // Similar to I420 but duplicate UV once more.