diff --git a/README.chromium b/README.chromium index 839f3b0f3..769605b1d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 399 +Version: 401 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4c16269bb..16d1f1774 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -42,6 +42,7 @@ extern "C" { #define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTOBAYERROW_SSSE3 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 @@ -94,7 +95,7 @@ extern "C" { #define HAS_CUMULATIVESUMTOAVERAGE_SSE2 #endif -// The following are Windows only: +// The following are Windows only. TODO(fbarchard): Port to gcc. #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 @@ -116,6 +117,7 @@ extern "C" { // The following are available on Neon platforms #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_ARGBTOBAYERROW_NEON #define HAS_COPYROW_NEON #define HAS_HALFROW_NEON #define HAS_I422TOABGRROW_NEON @@ -759,6 +761,12 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix); +void ARGBToBayerRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix); +void ARGBToBayerRow_SSSE3(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix); +void ARGBToBayerRow_NEON(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d761e3295..deae629da 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define 
LIBYUV_VERSION 399 +#define LIBYUV_VERSION 401 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 1c5aa9d9f..6b4d78b39 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -115,14 +115,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, const uint8* v_buf, uint8* rgb_buf, int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#elif defined(HAS_I422TOARGBROW_SSSE3) +#if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { @@ -132,6 +125,13 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -189,7 +189,6 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, return 0; } - // Convert I400 to ARGB. 
LIBYUV_API int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, @@ -725,21 +724,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* argb_buf, + uint8* rgb_buf, int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#elif defined(HAS_I422TOARGBROW_SSSE3) +#if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; } } #endif @@ -796,21 +797,23 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* argb_buf, + uint8* rgb_buf, int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#elif defined(HAS_I422TOARGBROW_SSSE3) +#if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; 
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#elif defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; } } #endif diff --git a/source/convert_from.cc b/source/convert_from.cc index 817dbd8b6..8e0406e55 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -701,14 +701,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, const uint8* v_buf, uint8* rgb_buf, int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#elif defined(HAS_I422TOARGBROW_SSSE3) +#if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { @@ -718,6 +711,13 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -739,8 +739,7 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, const uint8* src_v, int src_stride_v, uint8* dst_bgra, int dst_stride_bgra, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_bgra || + if (!src_y || !src_u || !src_v || !dst_bgra || width <= 0 || height == 0) { return -1; } @@ -755,14 +754,7 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, const uint8* v_buf, uint8* rgb_buf, int width) = I422ToBGRARow_C; -#if defined(HAS_I422TOBGRAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToBGRARow = I422ToBGRARow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToBGRARow = I422ToBGRARow_NEON; - } - } -#elif 
defined(HAS_I422TOBGRAROW_SSSE3) +#if defined(HAS_I422TOBGRAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToBGRARow = I422ToBGRARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { @@ -772,6 +764,13 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I422TOBGRAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToBGRARow = I422ToBGRARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToBGRARow = I422ToBGRARow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -793,8 +792,7 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, const uint8* src_v, int src_stride_v, uint8* dst_abgr, int dst_stride_abgr, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_abgr || + if (!src_y || !src_u || !src_v || !dst_abgr || width <= 0 || height == 0) { return -1; } @@ -809,14 +807,7 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, const uint8* v_buf, uint8* rgb_buf, int width) = I422ToABGRRow_C; -#if defined(HAS_I422TOABGRROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToABGRRow = I422ToABGRRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToABGRRow = I422ToABGRRow_NEON; - } - } -#elif defined(HAS_I422TOABGRROW_SSSE3) +#if defined(HAS_I422TOABGRROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToABGRRow = I422ToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { @@ -826,6 +817,13 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I422TOABGRROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToABGRRow = I422ToABGRRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToABGRRow = I422ToABGRRow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -847,8 +845,7 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y, const uint8* src_v, int src_stride_v, uint8* dst_rgba, int dst_stride_rgba, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_rgba || + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } @@ 
-863,14 +860,7 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y, const uint8* v_buf, uint8* rgb_buf, int width) = I422ToRGBARow_C; -#if defined(HAS_I422TORGBAROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGBARow = I422ToRGBARow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToRGBARow = I422ToRGBARow_NEON; - } - } -#elif defined(HAS_I422TORGBAROW_SSSE3) +#if defined(HAS_I422TORGBAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { @@ -880,6 +870,13 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I422TORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGBARow = I422ToRGBARow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -901,8 +898,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, const uint8* src_v, int src_stride_v, uint8* dst_rgb24, int dst_stride_rgb24, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_rgb24 || + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } @@ -913,24 +909,24 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, dst_stride_rgb24 = -dst_stride_rgb24; } void (*I422ToRGB24Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToRGB24Row_C; -#if defined(HAS_I422TORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRGB24Row = I422ToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToRGB24Row = I422ToRGB24Row_NEON; - } - } -#elif defined(HAS_I422TORGB24ROW_SSSE3) + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRGB24Row_C; +#if defined(HAS_I422TORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } +#elif defined(HAS_I422TORGB24ROW_NEON) + if 
(TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRGB24Row = I422ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB24Row = I422ToRGB24Row_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -948,12 +944,11 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, // Convert I420 to RAW. LIBYUV_API int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_raw || + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + if (!src_y || !src_u || !src_v || !dst_raw || width <= 0 || height == 0) { return -1; } @@ -964,24 +959,24 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, dst_stride_raw = -dst_stride_raw; } void (*I422ToRAWRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToRAWRow_C; -#if defined(HAS_I422TORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToRAWRow = I422ToRAWRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToRAWRow = I422ToRAWRow_NEON; - } - } -#elif defined(HAS_I422TORAWROW_SSSE3) + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToRAWRow_C; +#if defined(HAS_I422TORAWROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToRAWRow = I422ToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRAWRow = I422ToRAWRow_SSSE3; } } +#elif defined(HAS_I422TORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToRAWRow = I422ToRAWRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRAWRow = I422ToRAWRow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -1021,11 +1016,17 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, int width) = I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = 
I422ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } } #elif defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } } #endif @@ -1081,11 +1082,17 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, int width) = I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } } #elif defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } } #endif @@ -1141,11 +1148,17 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, int width) = I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } } #elif defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } } #endif diff --git a/source/format_conversion.cc b/source/format_conversion.cc index ed12de88d..41c7de4ab 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -20,81 +20,6 @@ namespace libyuv { extern "C" { #endif -// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers -// and vst would select 
which 2 components to write. The low level would need -// to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR - -#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) -#define HAS_ARGBTOBAYERROW_SSSE3 -__declspec(naked) __declspec(align(16)) -static void ARGBToBayerRow_SSSE3(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer - movd xmm5, [esp + 12] // selector - mov ecx, [esp + 16] // pix - pshufd xmm5, xmm5, 0 - - align 16 - wloop: - movdqa xmm0, [eax] - lea eax, [eax + 16] - pshufb xmm0, xmm5 - sub ecx, 4 - movd [edx], xmm0 - lea edx, [edx + 4] - jg wloop - ret - } -} - -#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) - -#define HAS_ARGBTOBAYERROW_SSSE3 -static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) { - asm volatile ( - "movd %3,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - ".p2align 4 \n" -"1: \n" - "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "sub $0x4,%2 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_bayer), // %1 - "+r"(pix) // %2 - : "g"(selector) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm5" -#endif - -); -} -#endif - -static void ARGBToBayerRow_C(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix) { - int index0 = selector & 0xff; - int index1 = (selector >> 8) & 0xff; - // Copy a row of Bayer. 
- for (int x = 0; x < pix - 1; x += 2) { - dst_bayer[0] = src_argb[index0]; - dst_bayer[1] = src_argb[index1]; - src_argb += 8; - dst_bayer += 2; - } - if (pix & 1) { - dst_bayer[0] = src_argb[index0]; - } -} - // generate a selector mask useful for pshufb static uint32 GenerateSelector(int select0, int select1) { return static_cast(select0) | @@ -147,11 +72,14 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb, void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) = ARGBToBayerRow_C; #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 4) && + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } #endif const int blue_index = 0; // Offsets for ARGB format const int green_index = 1; @@ -455,15 +383,22 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, const uint8* v_buf, uint8* rgb_buf, int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_NEON; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + } } -#elif defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; +#elif defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } } #endif + SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) = ARGBToBayerRow_C; @@ -471,6 +406,10 @@ int 
I420ToBayer(const uint8* src_y, int src_stride_y, if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } #endif const int blue_index = 0; // Offsets for ARGB format const int green_index = 1; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 58c102639..87b6273b8 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -626,8 +626,7 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) { ARGBToRGBARow = ARGBToRGBARow_SSSE3; } -#endif -#if defined(HAS_ARGBTORGBAROW_NEON) +#elif defined(HAS_ARGBTORGBAROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBToRGBARow = ARGBToRGBARow_NEON; } @@ -657,22 +656,17 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB24Row_C; #if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - if (width * 3 <= kMaxStride) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; - } - if (IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; } } -#endif -#if defined(HAS_ARGBTORGB24ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (width * 3 <= kMaxStride) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; - } +#elif defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; if 
(IS_ALIGNED(width, 8)) { ARGBToRGB24Row = ARGBToRGB24Row_NEON; } @@ -703,22 +697,17 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRAWRow_C; #if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - if (width * 3 <= kMaxStride) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; - } - if (IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; } } -#endif -#if defined(HAS_ARGBTORAWROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (width * 3 <= kMaxStride) { - ARGBToRAWRow = ARGBToRAWRow_Any_NEON; - } +#elif defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToRAWRow = ARGBToRAWRow_NEON; } @@ -749,11 +738,9 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB565Row_C; #if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - if (width * 2 <= kMaxStride) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; - } + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } @@ -784,11 +771,9 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB1555Row_C; #if defined(HAS_ARGBTOARGB1555ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if 
(TestCpuFlag(kCpuHasSSE2) && width >= 4 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - if (width * 2 <= kMaxStride) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; - } + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; } @@ -819,11 +804,9 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB4444Row_C; #if defined(HAS_ARGBTOARGB4444ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - if (width * 2 <= kMaxStride) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; - } + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; } @@ -839,7 +822,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, } // Convert NV12 to RGB565. -// TODO(fbarchard): (Re) Optimize for Neon. +// TODO(fbarchard): One pass conversion. 
LIBYUV_API int NV12ToRGB565(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, @@ -859,22 +842,26 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, uint8* rgb_buf, int width) = NV12ToARGBRow_C; #if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { NV12ToARGBRow = NV12ToARGBRow_SSSE3; } -#endif -#if defined(HAS_NV12TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width * 4 <= kMaxStride) { +#elif defined(HAS_NV12TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { NV12ToARGBRow = NV12ToARGBRow_NEON; } #endif - + if (width * 4 > kMaxStride) { + return -1; + } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB565Row_C; #if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } } #endif @@ -893,10 +880,10 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // Convert NV21 to RGB565. LIBYUV_API int NV21ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, + const uint8* src_uv, int src_stride_uv, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height) { - if (!src_y || !src_vu || !dst_rgb565 || width <= 0 || height == 0) { + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -910,27 +897,36 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, uint8* rgb_buf, int width) = NV21ToARGBRow_C; #if defined(HAS_NV21TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { NV21ToARGBRow = NV21ToARGBRow_SSSE3; } +#elif defined(HAS_NV21TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_NEON; + } #endif - + if (width * 4 > kMaxStride) { + return -1; + } SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB565Row_C; #if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } } #endif for (int y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_vu, row, width); + NV21ToARGBRow(src_y, src_uv, row, width); ARGBToRGB565Row(row, dst_rgb565, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { - src_vu += src_stride_vu; + src_uv += src_stride_uv; } } return 0; diff --git a/source/row_common.cc b/source/row_common.cc index 83c0d697b..eb8b56178 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -984,90 +984,103 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } -// Wrappers to handle odd width -#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT) \ +// YUV to RGB does multiple of 8 with SIMD and remainder with C. 
+#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP) \ void NAMEANY(const uint8* y_buf, \ const uint8* u_buf, \ const uint8* v_buf, \ uint8* rgb_buf, \ int width) { \ int n = width & ~7; \ - I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \ + I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \ I420TORGB_C(y_buf + n, \ u_buf + (n >> UV_SHIFT), \ v_buf + (n >> UV_SHIFT), \ - rgb_buf + n * 4, width & 7); \ + rgb_buf + n * BPP, width & 7); \ } // Wrappers to handle odd width -#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \ +#define Y2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \ void NAMEANY(const uint8* y_buf, \ const uint8* uv_buf, \ uint8* rgb_buf, \ int width) { \ int n = width & ~7; \ - NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \ + NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \ NV12TORGB_C(y_buf + n, \ uv_buf + (n >> UV_SHIFT), \ - rgb_buf + n * 4, width & 7); \ + rgb_buf + n * BPP, width & 7); \ } - #ifdef HAS_I422TOARGBROW_SSSE3 -YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0) -YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1) -YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2) -Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0) -Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0) -YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) -YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) -#endif -#ifdef HAS_I422TORGB24ROW_SSSE3 +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, + 0, 4) +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, + 1, 4) +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, + 2, 4) +Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, + 0, 4) +Y2NY(NV21ToARGBRow_Any_SSSE3, 
NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, + 0, 4) +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, + 1, 4) +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, + 1, 4) +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, + 1, 4) // I422ToRGB24Row_SSSE3 is unaligned. -YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1) -YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1) -#endif -#ifdef HAS_I422TORGBAROW_SSSE3 -YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1) -#endif +YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3) +YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3) +#endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_NEON -YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1) -YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1) -YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1) -YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1) -Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0) -Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0) -YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1) -YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1) -#endif +YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4) +YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4) +YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4) +YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4) +Y2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4) +Y2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4) +YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3) +YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3) 
+#endif // HAS_I422TOARGBROW_NEON #undef YANY -#define RGBANY(NAMEANY, ARGBTORGB, BPP) \ +// RGB to RGB does multiple of 16 pixels with SIMD and remainder with C. +// SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination. +// SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes. +// NEON RGB24 is multiple of 8 pixels, unaligned source and destination. +#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ void NAMEANY(const uint8* argb_buf, \ uint8* rgb_buf, \ int width) { \ - SIMD_ALIGNED(uint8 row[kMaxStride]); \ - ARGBTORGB(argb_buf, row, width); \ - memcpy(rgb_buf, row, width * BPP); \ + int n = width & ~MASK; \ + ARGBTORGB_SIMD(argb_buf, rgb_buf, n); \ + ARGBTORGB_C(argb_buf + n * SBPP, rgb_buf + n * BPP, width & MASK); \ } #if defined(HAS_ARGBTORGB24ROW_SSSE3) -RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3) -RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3) -RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2) -RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2) -RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2) +RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C, + 15, 4, 3) +RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C, + 15, 4, 3) +RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C, + 3, 4, 2) +RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, + 3, 4, 2) +RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, + 3, 4, 2) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) -RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3) -RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3) +RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) +RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3) #endif #undef RGBANY -#define YANY(NAMEANY, ARGBTOY_SSE, BPP) \ +// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with 
SIMD. +#define YANY(NAMEANY, ARGBTOY_SIMD, BPP) \ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ - ARGBTOY_SSE(src_argb, dst_y, width - 16); \ - ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \ + ARGBTOY_SIMD(src_argb, dst_y, width - 16); \ + ARGBTOY_SIMD(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \ } #ifdef HAS_ARGBTOYROW_SSSE3 @@ -1088,11 +1101,12 @@ YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2) #endif #undef YANY -#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \ +// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. +#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \ void NAMEANY(const uint8* src_argb, int src_stride_argb, \ uint8* dst_u, uint8* dst_v, int width) { \ int n = width & ~15; \ - ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \ + ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \ ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \ dst_u + (n >> 1), \ dst_v + (n >> 1), \ @@ -1117,11 +1131,11 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2) #endif #undef UVANY -#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \ +#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \ void NAMEANY(const uint8* src_argb, \ uint8* dst_u, uint8* dst_v, int width) { \ int n = width & ~15; \ - ANYTOUV_SSE(src_argb, dst_u, dst_v, n); \ + ANYTOUV_SIMD(src_argb, dst_u, dst_v, n); \ ANYTOUV_C(src_argb + n * BPP, \ dst_u + (n >> 1), \ dst_v + (n >> 1), \ @@ -1129,15 +1143,15 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2) } #ifdef HAS_YUY2TOUV422ROW_SSE2 -UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \ +UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, YUY2ToUV422Row_C, 2) -UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \ +UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, UYVYToUV422Row_C, 2) #endif #ifdef HAS_YUY2TOUV422ROW_NEON -UV422ANY(YUY2ToUV422Row_Any_NEON, 
YUY2ToUV422Row_NEON, \ +UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, YUY2ToUV422Row_C, 2) -UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, \ +UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, UYVYToUV422Row_C, 2) #endif #undef UV422ANY @@ -1240,6 +1254,7 @@ void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, } while (dst_ptr < end); } +// Blend 2 rows into 1 for conversions such as I422ToI420. void HalfRow_C(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { for (int x = 0; x < pix; ++x) { @@ -1247,6 +1262,23 @@ void HalfRow_C(const uint8* src_uv, int src_uv_stride, } } +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +void ARGBToBayerRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + int index0 = selector & 0xff; + int index1 = (selector >> 8) & 0xff; + // Copy a row of Bayer. + for (int x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[index0]; + dst_bayer[1] = src_argb[index1]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[index0]; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_neon.cc b/source/row_neon.cc index 200538de5..560206ce2 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -842,6 +842,24 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, ); } +// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG +// TODO(fbarchard): Neon port. +void ARGBToBayerRow_NEON(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix) { + int index0 = selector & 0xff; + int index1 = (selector >> 8) & 0xff; + // Copy a row of Bayer. 
+ for (int x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[index0]; + dst_bayer[1] = src_argb[index1]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[index0]; + } +} + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/row_posix.cc b/source/row_posix.cc index 74783d370..8e584e06c 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -118,6 +118,16 @@ CONST uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; +// Shuffle table for converting ARGBToRGB24 for I420ToRGB24. First 8 + next 4 +CONST uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGB to RAW. +CONST uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; + void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" @@ -1431,6 +1441,115 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, ); } +void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb24_buf, + int width) { +// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. 
+#ifdef __APPLE__ + asm volatile ( + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm6 \n" + :: [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24), + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0)); +#endif + + asm volatile ( +#ifndef __APPLE__ + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm6 \n" +#endif + "sub %[u_buf],%[v_buf] \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0,(%[rgb24_buf]) \n" + "movdqu %%xmm1,0x8(%[rgb24_buf]) \n" + "lea 0x18(%[rgb24_buf]),%[rgb24_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [rgb24_buf]"+r"(rgb24_buf), // %[rgb24_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) +#ifndef __APPLE__ + , [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24), + [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0) +#endif + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* raw_buf, + int width) { +#ifdef __APPLE__ + asm volatile ( + "movdqa %[kShuffleMaskARGBToRAW],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm6 \n" + :: [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW), + [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0)); +#endif + + asm volatile ( +#ifndef __APPLE__ + "movdqa %[kShuffleMaskARGBToRAW],%%xmm5 \n" + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm6 \n" +#endif + "sub %[u_buf],%[v_buf] \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + 
"1: \n" + READYUV422 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "movq %%xmm0,(%[raw_buf]) \n" + "movdqu %%xmm1,0x8(%[raw_buf]) \n" + "lea 0x18(%[raw_buf]),%[raw_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [raw_buf]"+r"(raw_buf), // %[raw_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) +#ifndef __APPLE__ + , [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW), + [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0) +#endif + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -3751,6 +3870,31 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, #endif ); } + +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "movd %3,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "sub $0x4,%2 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : "g"(selector) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm5" +#endif + ); +} #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 8a29f24bb..9b9573cae 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4217,6 +4217,29 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, } } +__declspec(naked) __declspec(align(16)) +void ARGBToBayerRow_SSSE3(const uint8* src_argb, + uint8* 
dst_bayer, uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + movd xmm5, [esp + 12] // selector + mov ecx, [esp + 16] // pix + pshufd xmm5, xmm5, 0 + + align 16 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + pshufb xmm0, xmm5 + sub ecx, 4 + movd [edx], xmm0 + lea edx, [edx + 4] + jg wloop + ret + } +} + #endif // _M_IX86 #ifdef __cplusplus diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 8af0bf6c2..50740d767 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -29,9 +29,9 @@ namespace libyuv { #define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \ -TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N##_OptVsC) { \ - const int kWidth = 1280; \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG) \ +TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = W1280; \ const int kHeight = 720; \ align_buffer_16(src_y, kWidth * kHeight); \ align_buffer_16(src_u, kWidth / SRC_SUBSAMP_X * kHeight / SRC_SUBSAMP_Y); \ @@ -117,9 +117,11 @@ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N##_OptVsC) { \ #define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280, _Opt, +) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ - FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -) \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1280,_Invert, -) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 1276,_Any, +) TESTPLANARTOP(I420, 2, 2, I420, 2, 2) TESTPLANARTOP(I422, 2, 1, I420, 2, 2) @@ -129,9 +131,10 @@ TESTPLANARTOP(I420, 2, 2, I422, 2, 1) TESTPLANARTOP(I420, 2, 2, I444, 1, 1) TESTPLANARTOP(I420, 2, 2, I411, 4, 1) -#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, 
SUBSAMP_Y, FMT_B, BPP_B, N, NEG) \ -TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ - const int kWidth = 1280; \ +#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + W1280, N, NEG) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ const int kHeight = 720; \ const int kStride = (kWidth * 8 * BPP_B + 7) / 8; \ align_buffer_16(src_y, kWidth * kHeight); \ @@ -182,8 +185,12 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ } #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \ - TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -) + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + 1280, _Opt, +) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + 1280, _Invert, -) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + 1276, _Any, +) TESTPLANARTOB(I420, 2, 2, ARGB, 4) TESTPLANARTOB(I420, 2, 2, BGRA, 4) @@ -211,9 +218,9 @@ TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1) TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - N, NEG) \ -TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ - const int kWidth = 1280; \ + W1280, N, NEG) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ const int kHeight = 720; \ align_buffer_16(src_y, kWidth * kHeight); \ align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2); \ @@ -258,17 +265,22 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \ } #define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, , +) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, Invert, -) + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + 1280, _Opt, +) \ + TESTBIPLANARTOBI(FMT_PLANAR, 
SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + 1280, _Invert, -) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + 1276, _Any, +) TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4) TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4) TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2) TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2) -#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, N, NEG) \ -TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \ - const int kWidth = 1280; \ +#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG) \ +TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = W1280; \ const int kHeight = 720; \ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \ align_buffer_16(src_argb, kStride * kHeight); \ @@ -340,8 +352,12 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \ } #define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ - TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, , +) \ - TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, Invert, -) + TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + 1280, _Opt, +) \ + TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + 1280, _Invert, -) \ + TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + 1276, _Any, +) TESTATOPLANAR(ARGB, 4, I420, 2, 2) TESTATOPLANAR(BGRA, 4, I420, 2, 2) @@ -367,9 +383,9 @@ TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2) TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2) TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2) -#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, N, NEG) \ -TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \ - const int kWidth = 1280; \ +#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, W1280, N, NEG) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = W1280; \ const int kHeight = 720; \ align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \ align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \ @@ -403,10 +419,10 
@@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \ free_aligned_buffer_16(dst_argb_opt) \ } #define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, , +) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, Invert, -) + TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, 1280, _Opt, +) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, 1280, _Invert, -) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, 1280, _Any, +) -TESTATOB(I400, 1, 1, I400, 1) TESTATOB(ARGB, 4, 4, ARGB, 4) TESTATOB(ARGB, 4, 4, BGRA, 4) TESTATOB(ARGB, 4, 4, ABGR, 4) @@ -416,6 +432,11 @@ TESTATOB(ARGB, 4, 4, RGB24, 3) TESTATOB(ARGB, 4, 4, RGB565, 2) TESTATOB(ARGB, 4, 4, ARGB1555, 2) TESTATOB(ARGB, 4, 4, ARGB4444, 2) +TESTATOB(ARGB, 4, 4, BayerBGGR, 1) +TESTATOB(ARGB, 4, 4, BayerRGGB, 1) +TESTATOB(ARGB, 4, 4, BayerGBRG, 1) +TESTATOB(ARGB, 4, 4, BayerGRBG, 1) +TESTATOB(ARGB, 4, 4, I400, 1) TESTATOB(BGRA, 4, 4, ARGB, 4) TESTATOB(ABGR, 4, 4, ARGB, 4) TESTATOB(RGBA, 4, 4, ARGB, 4) @@ -427,30 +448,37 @@ TESTATOB(ARGB4444, 2, 2, ARGB, 4) TESTATOB(YUY2, 2, 2, ARGB, 4) TESTATOB(UYVY, 2, 2, ARGB, 4) TESTATOB(M420, 3 / 2, 1, ARGB, 4) +TESTATOB(BayerBGGR, 1, 1, ARGB, 4) +TESTATOB(BayerRGGB, 1, 1, ARGB, 4) +TESTATOB(BayerGBRG, 1, 1, ARGB, 4) +TESTATOB(BayerGRBG, 1, 1, ARGB, 4) +TESTATOB(I400, 1, 1, ARGB, 4) +TESTATOB(I400, 1, 1, I400, 1) -static const int kReadPad = 16; // Allow overread of 16 bytes. 
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \ +#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B) \ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ srandom(time(NULL)); \ for (int times = 0; times < benchmark_iterations_; ++times) { \ const int kWidth = (random() & 63) + 1; \ const int kHeight = (random() & 31) + 1; \ - align_buffer_page_end(src_argb, (kWidth * BPP_A) * kHeight + kReadPad); \ - align_buffer_page_end(dst_argb_c, (kWidth * BPP_B) * kHeight); \ - align_buffer_page_end(dst_argb_opt, (kWidth * BPP_B) * kHeight); \ - for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \ + const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\ + const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\ + align_buffer_page_end(src_argb, kStrideA * kHeight); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight); \ + for (int i = 0; i < kStrideA * kHeight; ++i) { \ src_argb[i] = (random() & 0xff); \ } \ MaskCpuFlags(kCpuInitialized); \ - FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ - dst_argb_c, kWidth * BPP_B, \ + FMT_A##To##FMT_B(src_argb, kStrideA, \ + dst_argb_c, kStrideB, \ kWidth, kHeight); \ MaskCpuFlags(-1); \ - FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ - dst_argb_opt, kWidth * BPP_B, \ + FMT_A##To##FMT_B(src_argb, kStrideA, \ + dst_argb_opt, kStrideB, \ kWidth, kHeight); \ int max_diff = 0; \ - for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \ + for (int i = 0; i < kStrideB * kHeight; ++i) { \ int abs_diff = \ abs(static_cast<int>(dst_argb_c[i]) - \ static_cast<int>(dst_argb_opt[i])); \ @@ -465,24 +493,31 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ } \ } -TESTATOBRANDOM(ARGB, 4, 4, ARGB, 4) -TESTATOBRANDOM(ARGB, 4, 4, BGRA, 4) -TESTATOBRANDOM(ARGB, 4, 4, ABGR, 4) -TESTATOBRANDOM(ARGB, 4, 4, RGBA, 4) -TESTATOBRANDOM(ARGB, 4, 4, RAW, 3) -TESTATOBRANDOM(ARGB, 4, 4, RGB24, 3) -TESTATOBRANDOM(ARGB, 4, 4, RGB565,
2) -TESTATOBRANDOM(ARGB, 4, 4, ARGB1555, 2) -TESTATOBRANDOM(ARGB, 4, 4, ARGB4444, 2) - -TESTATOBRANDOM(BGRA, 4, 4, ARGB, 4) -TESTATOBRANDOM(ABGR, 4, 4, ARGB, 4) -TESTATOBRANDOM(RGBA, 4, 4, ARGB, 4) -TESTATOBRANDOM(RAW, 3, 3, ARGB, 4) -TESTATOBRANDOM(RGB24, 3, 3, ARGB, 4) -TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4) -TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4) -TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4) +TESTATOBRANDOM(ARGB, 4, 4, ARGB, 4, 4) +TESTATOBRANDOM(ARGB, 4, 4, BGRA, 4, 4) +TESTATOBRANDOM(ARGB, 4, 4, ABGR, 4, 4) +TESTATOBRANDOM(ARGB, 4, 4, RGBA, 4, 4) +TESTATOBRANDOM(ARGB, 4, 4, RAW, 3, 3) +TESTATOBRANDOM(ARGB, 4, 4, RGB24, 3, 3) +TESTATOBRANDOM(ARGB, 4, 4, RGB565, 2, 2) +TESTATOBRANDOM(ARGB, 4, 4, ARGB1555, 2, 2) +TESTATOBRANDOM(ARGB, 4, 4, ARGB4444, 2, 2) +TESTATOBRANDOM(ARGB, 4, 4, I400, 1, 1) +// TODO(fbarchard): Implement YUY2 +// TESTATOBRANDOM(ARGB, 4, 4, YUY2, 4, 2) +// TESTATOBRANDOM(ARGB, 4, 4, UYVY, 4, 2) +TESTATOBRANDOM(BGRA, 4, 4, ARGB, 4, 4) +TESTATOBRANDOM(ABGR, 4, 4, ARGB, 4, 4) +TESTATOBRANDOM(RGBA, 4, 4, ARGB, 4, 4) +TESTATOBRANDOM(RAW, 3, 3, ARGB, 4, 4) +TESTATOBRANDOM(RGB24, 3, 3, ARGB, 4, 4) +TESTATOBRANDOM(RGB565, 2, 2, ARGB, 4, 4) +TESTATOBRANDOM(ARGB1555, 2, 2, ARGB, 4, 4) +TESTATOBRANDOM(ARGB4444, 2, 2, ARGB, 4, 4) +TESTATOBRANDOM(I400, 1, 1, ARGB, 4, 4) +TESTATOBRANDOM(YUY2, 4, 2, ARGB, 4, 4) +TESTATOBRANDOM(UYVY, 4, 2, ARGB, 4, 4) +TESTATOBRANDOM(I400, 1, 1, I400, 1, 1) TEST_F(libyuvTest, TestAttenuate) { SIMD_ALIGNED(uint8 orig_pixels[256][4]);