From e365cdde3b18f8c109c9d6319dbfb5c493c242e6 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 25 Sep 2015 10:29:20 -0700 Subject: [PATCH] I420Alpha row function in 1 pass. API change - I420AlphaToARGB takes flag indicating if RGB should be premultiplied by alpha. This version implements an efficient SSSE3 version for Windows. C version done in 2 steps. Was libyuvTest.I420AlphaToARGB_Any (1136 ms) libyuvTest.I420AlphaToARGB_Unaligned (1210 ms) libyuvTest.I420AlphaToARGB_Invert (966 ms) libyuvTest.I420AlphaToARGB_Opt (1031 ms) libyuvTest.I420AlphaToABGR_Any (1020 ms) libyuvTest.I420AlphaToABGR_Unaligned (1359 ms) libyuvTest.I420AlphaToABGR_Invert (1082 ms) libyuvTest.I420AlphaToABGR_Opt (986 ms) R=harryjin@google.com BUG=libyuv:496 Review URL: https://codereview.chromium.org/1367093002 . --- README.chromium | 2 +- include/libyuv/convert_argb.h | 4 +- include/libyuv/row.h | 51 ++++++++++++++ include/libyuv/version.h | 2 +- source/convert_argb.cc | 129 +++++++++++++--------------------- source/row_any.cc | 30 +++++++- source/row_common.cc | 23 ++++++ source/row_win.cc | 96 +++++++++++++++++++++++++ unit_test/convert_test.cc | 16 +++-- 9 files changed, 261 insertions(+), 92 deletions(-) diff --git a/README.chromium b/README.chromium index fa0c3137d..8c29c7ca3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1492 +Version: 1493 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index a161d3343..696fa9d30 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -83,7 +83,7 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, const uint8* src_v, int src_stride_v, const uint8* src_a, int src_stride_a, uint8* dst_argb, int dst_stride_argb, - int width, int height); + int width, int height, int attenuate); // Convert I420 with Alpha to preattenuated ABGR. 
LIBYUV_API @@ -92,7 +92,7 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y, const uint8* src_v, int src_stride_v, const uint8* src_a, int src_stride_a, uint8* dst_abgr, int dst_stride_abgr, - int width, int height); + int width, int height, int attenuate); // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 757fa0756..acfe90047 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -187,6 +187,14 @@ extern "C" { #define HAS_I422TOABGRROW_SSSE3 #endif + +// The following are available on 32 bit x86 Visual C and clangcl. +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined (_M_IX86) +#define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I422ALPHATOABGRROW_SSSE3 +#endif + // The following are available for AVX2 Visual C and clangcl 32 bit: // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ @@ -257,6 +265,7 @@ extern "C" { #endif // The following are disabled when SSSE3 is available: +// TODO(fbarchard): remove sse2. ssse3 is faster and well supported. 
#if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_SSSE3_ONLY) @@ -1045,6 +1054,20 @@ void I422ToARGBRow_C(const uint8* src_y, uint8* dst_argb, struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width); +void I422AlphaToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width); void I422ToABGRRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1213,6 +1236,20 @@ void I422ToARGBRow_SSSE3(const uint8* src_y, uint8* dst_argb, struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width); +void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width); void I422ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1405,6 +1442,20 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y, uint8* dst_argb, struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width); +void I422AlphaToABGRRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width); void I411ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 4081c214a..daa8755ce 
100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1492 +#define LIBYUV_VERSION 1493 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index dc3071926..51817da16 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -336,16 +336,15 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, const uint8* src_v, int src_stride_v, const uint8* src_a, int src_stride_a, uint8* dst_argb, int dst_stride_argb, - int width, int height) { + int width, int height, int attenuate) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = - ARGBCopyYToAlphaRow_C; + void (*I422AlphaToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || @@ -358,53 +357,37 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } -#if defined(HAS_I422TOARGBROW_SSSE3) +#if defined(HAS_I422ALPHATOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; } } #endif -#if defined(HAS_I422TOARGBROW_AVX2) +#if defined(HAS_I422ALPHATOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; + 
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_AVX2; + I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; } } #endif -#if defined(HAS_I422TOARGBROW_NEON) +#if defined(HAS_I422ALPHATOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; + I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; } } #endif -#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) +#if defined(HAS_I422ALPHATOARGBROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; - } + I422AlphaToARGBRow = I422AlphaToARGBRow_MIPS_DSPR2; } #endif #if defined(HAS_ARGBATTENUATEROW_SSE2) @@ -441,9 +424,10 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvConstants, width); - ARGBCopyYToAlphaRow(src_a, dst_argb, width); - ARGBAttenuateRow(dst_argb, dst_argb, width); + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvConstants, width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } dst_argb += dst_stride_argb; src_a += src_stride_a; src_y += src_stride_y; @@ 
-454,24 +438,24 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, } return 0; } -// Convert I420 with Alpha to preattenuated ABGR. + +// Convert I420 with Alpha to ABGR, preattenuated when attenuate is nonzero. LIBYUV_API int I420AlphaToABGR(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, const uint8* src_a, int src_stride_a, uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { + int width, int height, int attenuate) { int y; - void (*I422ToABGRRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - struct YuvConstants* yuvconstants, - int width) = I422ToABGRRow_C; - void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = - ARGBCopyYToAlphaRow_C; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*I422AlphaToABGRRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width) = I422AlphaToABGRRow_C; + void (*ARGBAttenuateRow)(const uint8* src_abgr, uint8* dst_abgr, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_abgr || width <= 0 || height == 0) { @@ -483,53 +467,37 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y, dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; dst_stride_abgr = -dst_stride_abgr; } -#if defined(HAS_I422TOABGRROW_SSSE3) +#if defined(HAS_I422ALPHATOABGRROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToABGRRow = I422ToABGRRow_Any_SSSE3; + I422AlphaToABGRRow = I422AlphaToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_SSSE3; + I422AlphaToABGRRow = I422AlphaToABGRRow_SSSE3; } } #endif -#if defined(HAS_I422TOABGRROW_AVX2) +#if defined(HAS_I422ALPHATOABGRROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I422ToABGRRow = I422ToABGRRow_Any_AVX2; + I422AlphaToABGRRow = I422AlphaToABGRRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I422ToABGRRow = 
I422ToABGRRow_AVX2; + I422AlphaToABGRRow = I422AlphaToABGRRow_AVX2; } } #endif -#if defined(HAS_I422TOABGRROW_NEON) +#if defined(HAS_I422ALPHATOABGRROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToABGRRow = I422ToABGRRow_Any_NEON; + I422AlphaToABGRRow = I422AlphaToABGRRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToABGRRow = I422ToABGRRow_NEON; + I422AlphaToABGRRow = I422AlphaToABGRRow_NEON; } } #endif -#if defined(HAS_I422TOABGRROW_MIPS_DSPR2) +#if defined(HAS_I422ALPHATOABGRROW_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && IS_ALIGNED(dst_abgr, 4) && IS_ALIGNED(dst_stride_abgr, 4)) { - I422ToABGRRow = I422ToABGRRow_MIPS_DSPR2; - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; - } - } -#endif -#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { - ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; - } + I422AlphaToABGRRow = I422AlphaToABGRRow_MIPS_DSPR2; } #endif #if defined(HAS_ARGBATTENUATEROW_SSE2) @@ -566,9 +534,10 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422ToABGRRow(src_y, src_u, src_v, dst_abgr, &kYuvConstants, width); - ARGBCopyYToAlphaRow(src_a, dst_abgr, width); - ARGBAttenuateRow(dst_abgr, dst_abgr, width); + I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvConstants, width); + if (attenuate) { + ARGBAttenuateRow(dst_abgr, dst_abgr, width); + } dst_abgr += dst_stride_abgr; src_a += src_stride_a; src_y += src_stride_y; diff --git a/source/row_any.cc b/source/row_any.cc index 46cbdc759..cae38f7ac 100644 --- 
a/source/row_any.cc +++ b/source/row_any.cc @@ -22,6 +22,34 @@ extern "C" { // Subsampled source needs to be increase by 1 of not even. #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) +// Any 4 planes (Y, U, V, A) to 1 with yuvconstants; width need not be a multiple of MASK + 1. +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ + const uint8* a_buf, uint8* dst_ptr, \ + struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* zero the 4 input slots for msan */ \ + int r = width & MASK; /* leftover pixels */ \ + int n = width & ~MASK; /* pixels covered by the direct SIMD call */ \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) +ANY41C(I422AlphaToABGRRow_Any_SSSE3, I422AlphaToABGRRow_SSSE3, 1, 0, 4, 7) +#endif +#undef ANY41C + // Any 3 planes to 1. 
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ @@ -50,7 +78,7 @@ ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif -#undef ANY31C +#undef ANY31 // Any 3 planes to 1 with yuvconstants #define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ diff --git a/source/row_common.cc b/source/row_common.cc index 27a908d73..3c025f1a4 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2412,6 +2412,29 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, } #endif +// C I422Alpha rows, 2 steps: convert using yuvconstants, then copy in alpha. +void I422AlphaToARGBRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + I422ToARGBRow_C(src_y, src_u, src_v, dst_argb, yuvconstants, width); + ARGBCopyYToAlphaRow_C(src_a, dst_argb, width); +} + +void I422AlphaToABGRRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + const uint8* src_a, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width) { + I422ToABGRRow_C(src_y, src_u, src_v, dst_abgr, yuvconstants, width); + ARGBCopyYToAlphaRow_C(src_a, dst_abgr, width); +} + #if defined(HAS_I422TOARGB1555ROW_SSSE3) void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, diff --git a/source/row_win.cc b/source/row_win.cc index f08012f55..fff337a7d 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2416,6 +2416,20 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm lea eax, [eax + 8] \ } +// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. 
+#define READYUVA422 __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 4] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm4, xmm4 \ + __asm lea eax, [eax + 8] \ + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8] \ + } + // Read 2 UV from 411, upsample to 8 UV. #define READYUV411 __asm { \ __asm pinsrw xmm0, [esi], 0 /* U */ \ @@ -2833,6 +2847,88 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, } } +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB (32 bytes). +__declspec(naked) +void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y (16 = the 4 registers pushed above) + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // argb + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi // edi = V - U so READYUVA422 can read V at [esi + edi] + pcmpeqb xmm5, xmm5 // NOTE: dead store, READYUVA422 reloads xmm5 with A each loop + + convertloop: + READYUVA422 + YUVTORGB(ebx) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// 8 pixels. +// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR (32 bytes). 
+__declspec(naked) +void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + const uint8* a_buf, + uint8* dst_abgr, + struct YuvConstants* yuvconstants, + int width) { + __asm { + push esi + push edi + push ebx + push ebp + mov eax, [esp + 16 + 4] // Y (16 = the 4 registers pushed above) + mov esi, [esp + 16 + 8] // U + mov edi, [esp + 16 + 12] // V + mov ebp, [esp + 16 + 16] // A + mov edx, [esp + 16 + 20] // abgr + mov ebx, [esp + 16 + 24] // yuvconstants + mov ecx, [esp + 16 + 28] // width + sub edi, esi // edi = V - U so READYUVA422 can read V at [esi + edi] + pcmpeqb xmm5, xmm5 // NOTE: dead store, READYUVA422 reloads xmm5 with A each loop + + convertloop: + READYUVA422 + YUVTORGB(ebx) + STOREABGR + + sub ecx, 8 + jg convertloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + // 8 pixels. // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // Similar to I420 but duplicate UV once more. diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index e0dabcf69..43258d55b 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -518,7 +518,7 @@ TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4) TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4) #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, W1280, DIFF, N, NEG, OFF) \ + YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -547,7 +547,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ src_a + OFF, kWidth, \ dst_argb_c + OFF, kStrideB, \ - kWidth, NEG kHeight); \ + kWidth, NEG kHeight, ATTEN); \ MaskCpuFlags(-1); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ @@ -555,7 +555,7 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ src_a + OFF, kWidth, \ dst_argb_opt + OFF, kStrideB, \ - kWidth, NEG kHeight); \ + kWidth, NEG kHeight, ATTEN); \ } \ int max_diff = 0; \ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ @@ -578,13 +578,15 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ #define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, DIFF) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0) \ + YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1) \ + YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0) \ + YALIGN, benchmark_width_, DIFF, _Invert, -, 0, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0) + YALIGN, benchmark_width_, DIFF, _Opt, +, 0, 0) \ + TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Premult, +, 0, 1) TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)