diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 1d8c2abb8..9fc50c31d 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -31,6 +31,13 @@ void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); +// Convert I420 to I400. (calls CopyPlane ignoring u/v) +int I420ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + // I420 mirror. int I420Mirror(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -62,6 +69,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Convert I444 to ARGB. +int I444ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + // Convert I422 to ARGB. int I422ToARGB(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -69,8 +83,8 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Convert I444 to ARGB. -int I444ToARGB(const uint8* src_y, int src_stride_y, +// Convert I411 to ARGB. +int I411ToARGB(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, uint8* dst_argb, int dst_stride_argb, diff --git a/source/convert_from.cc b/source/convert_from.cc index 55ff8f5c4..569945d88 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -660,32 +660,32 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_Any_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I420ToARGBRow = I420ToARGBRow_Any_SSSE3; + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I420ToARGBRow = I420ToARGBRow_Unaligned_SSSE3; + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } } } #endif for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, dst_argb, width); + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -708,32 +708,32 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; dst_stride_bgra = -dst_stride_bgra; } - void (*I420ToBGRARow)(const uint8* y_buf, + void (*I422ToBGRARow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToBGRARow_C; -#if defined(HAS_I420TOBGRAROW_NEON) + int width) = I422ToBGRARow_C; +#if defined(HAS_I422TOBGRAROW_NEON) if 
(TestCpuFlag(kCpuHasNEON)) { - I420ToBGRARow = I420ToBGRARow_Any_NEON; + I422ToBGRARow = I422ToBGRARow_Any_NEON; if (IS_ALIGNED(width, 16)) { - I420ToBGRARow = I420ToBGRARow_NEON; + I422ToBGRARow = I422ToBGRARow_NEON; } } -#elif defined(HAS_I420TOBGRAROW_SSSE3) +#elif defined(HAS_I422TOBGRAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I420ToBGRARow = I420ToBGRARow_Any_SSSE3; + I422ToBGRARow = I422ToBGRARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I420ToBGRARow = I420ToBGRARow_Unaligned_SSSE3; + I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3; if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { - I420ToBGRARow = I420ToBGRARow_SSSE3; + I422ToBGRARow = I422ToBGRARow_SSSE3; } } } #endif for (int y = 0; y < height; ++y) { - I420ToBGRARow(src_y, src_u, src_v, dst_bgra, width); + I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width); dst_bgra += dst_stride_bgra; src_y += src_stride_y; if (y & 1) { @@ -756,32 +756,32 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; dst_stride_abgr = -dst_stride_abgr; } - void (*I420ToABGRRow)(const uint8* y_buf, + void (*I422ToABGRRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToABGRRow_C; -#if defined(HAS_I420TOABGRROW_NEON) + int width) = I422ToABGRRow_C; +#if defined(HAS_I422TOABGRROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToABGRRow = I420ToABGRRow_Any_NEON; + I422ToABGRRow = I422ToABGRRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - I420ToABGRRow = I420ToABGRRow_NEON; + I422ToABGRRow = I422ToABGRRow_NEON; } } -#elif defined(HAS_I420TOABGRROW_SSSE3) +#elif defined(HAS_I422TOABGRROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I420ToABGRRow = I420ToABGRRow_Any_SSSE3; + I422ToABGRRow = I422ToABGRRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I420ToABGRRow = I420ToABGRRow_Unaligned_SSSE3; + I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3; if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { - I420ToABGRRow = I420ToABGRRow_SSSE3; + I422ToABGRRow = I422ToABGRRow_SSSE3; } } } #endif for (int y = 0; y < height; ++y) { - I420ToABGRRow(src_y, src_u, src_v, dst_abgr, width); + I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width); dst_abgr += dst_stride_abgr; src_y += src_stride_y; if (y & 1) { @@ -804,18 +804,18 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } #endif @@ -835,7 +835,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, #endif for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, row, width); + I422ToARGBRow(src_y, src_u, src_v, row, width); ARGBToRGB24Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -859,18 +859,18 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void 
(*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } #endif @@ -890,7 +890,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, #endif for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, row, width); + I422ToARGBRow(src_y, src_u, src_v, row, width); ARGBToRAWRow(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -914,18 +914,18 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } #endif @@ -944,7 +944,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, #endif for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, row, width); + I422ToARGBRow(src_y, src_u, src_v, row, width); ARGBToRGB565Row(row, dst_rgb, width); dst_rgb += dst_stride_rgb; src_y += src_stride_y; @@ -968,18 +968,18 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } #endif @@ -998,7 +998,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, #endif for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, row, width); + I422ToARGBRow(src_y, src_u, src_v, row, width); ARGBToARGB1555Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; @@ -1022,18 +1022,18 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_NEON; + 
I422ToARGBRow = I422ToARGBRow_NEON; } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } #endif @@ -1052,7 +1052,7 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, #endif for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, row, width); + I422ToARGBRow(src_y, src_u, src_v, row, width); ARGBToARGB4444Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; diff --git a/source/format_conversion.cc b/source/format_conversion.cc index dea1491bd..d45a719bb 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -446,18 +446,18 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); @@ -478,7 +478,7 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, } for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, row, width); + I422ToARGBRow(src_y, src_u, src_v, row, width); ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width); dst_bayer += dst_stride_bayer; src_y += src_stride_y; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 9239d89cc..08d8217fa 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -51,6 +51,26 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } } +// Convert I420 to I400. (calls CopyPlane ignoring u/v) +int I420ToI400(const uint8* src_y, int src_stride_y, + uint8* dst_y, int dst_stride_y, + uint8*, int, + uint8*, int, + int width, int height) { + if (!src_y || !dst_y || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + return 0; +} + // Mirror a plane of data void MirrorPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, @@ -202,50 +222,6 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, return 0; } -// Convert I422 to ARGB. -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - // Negative height means invert the image. 
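The new I420ToI400 above keeps U/V destination parameters only so that its signature lines up with the other I420To* converters; just the Y plane is copied. A minimal usage sketch, with sizes and buffer names chosen purely for illustration:

  uint8 src_y[640 * 480];  // Y plane of an I420 frame
  uint8 dst_y[640 * 480];  // receives the whole I400 (grey) image
  I420ToI400(src_y, 640,   // source Y plane and stride
             dst_y, 640,   // destination Y plane and stride
             NULL, 0,      // U destination: ignored
             NULL, 0,      // V destination: ignored
             640, 480);    // a negative height would invert the image
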
- if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - void (*I420ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I420ToARGBRow = I420ToARGBRow_NEON; - } - } -#elif defined(HAS_I420TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I420ToARGBRow = I420ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; - } - } -#endif - - for (int y = 0; y < height; ++y) { - I420ToARGBRow(src_y, src_u, src_v, dst_argb, width); - dst_argb += dst_stride_argb; - src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; - } - return 0; -} - // Convert I444 to ARGB. int I444ToARGB(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -264,10 +240,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, uint8* rgb_buf, int width) = I444ToARGBRow_C; #if defined(HAS_I444TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I444ToARGBRow = I444ToARGBRow_SSSE3; + } + } } #endif @@ -281,6 +261,92 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert I422 to ARGB. +int I422ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#elif defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I411 to ARGB. +int I411ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + // Negative height means invert the image. 
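// (Note on the sampling variants: I444 consumes one UV pair per pixel,
// I422 one per two pixels, and I411 one per four. There is no I420 row
// kernel because within a single row I420 and I422 are laid out
// identically; I420 callers reuse I422ToARGBRow and handle the 2x
// vertical subsampling themselves by advancing the chroma pointers only
// after odd rows:
//   if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; }
// as in the I420To* loops in convert_from.cc above.)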
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*I411ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I411ToARGBRow_C; +#if defined(HAS_I411TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I411ToARGBRow = I411ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + I411ToARGBRow(src_y, src_u, src_v, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + + // Convert I400 to ARGB. int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, @@ -724,24 +790,24 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* argb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_Any_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I420ToARGBRow = I420ToARGBRow_Any_SSSE3; + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -766,7 +832,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); src_uv += src_stride_uv; } - I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width); + I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; } @@ -803,24 +869,24 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, } } #endif - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* argb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_Any_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I420ToARGBRow = I420ToARGBRow_Any_SSSE3; + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -832,7 +898,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, for (int y = 0; y < height; ++y) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, rowu, rowv, width); 
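// (All packed-source paths stage one row at a time through SIMD_ALIGNED
// row buffers: YUY2 and UYVY are split into rowy/rowu/rowv by their
// respective ToUVRow/ToYRow helpers, and NV12 de-interleaves its UV plane
// with SplitUV on every other source row. Each staged row is then fed to
// the shared I422ToARGBRow kernel, so one set of row converters serves
// every 4:2:2-shaped source.)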
YUY2ToYRow(src_yuy2, rowy, width); - I420ToARGBRow(rowy, rowu, rowv, dst_argb, width); + I422ToARGBRow(rowy, rowu, rowv, dst_argb, width); src_yuy2 += src_stride_yuy2; dst_argb += dst_stride_argb; } @@ -869,24 +935,24 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, } } #endif - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* argb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_Any_NEON; + I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I420ToARGBRow = I420ToARGBRow_Any_SSSE3; + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif @@ -898,7 +964,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, for (int y = 0; y < height; ++y) { UYVYToUVRow(src_uyvy, src_stride_uyvy, rowu, rowv, width); UYVYToYRow(src_uyvy, rowy, width); - I420ToARGBRow(rowy, rowu, rowv, dst_argb, width); + I422ToARGBRow(rowy, rowu, rowv, dst_argb, width); src_uyvy += src_stride_uyvy; dst_argb += dst_stride_argb; } @@ -916,18 +982,18 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb; } - void (*I420ToARGBRow)(const uint8* y_buf, + void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, - int width) = I420ToARGBRow_C; -#if defined(HAS_I420TOARGBROW_NEON) + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I420ToARGBRow = I420ToARGBRow_NEON; + I422ToARGBRow = I422ToARGBRow_NEON; } -#elif defined(HAS_I420TOARGBROW_SSSE3) +#elif defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); @@ -960,7 +1026,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); src_uv += src_stride_uv; } - I420ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width); + I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width); ARGBToRGB565Row(row, dst_rgb, width); dst_rgb += dst_stride_rgb; src_y += src_stride_y; diff --git a/source/row.h b/source/row.h index 3c44d416d..6a4ba990b 100644 --- a/source/row.h +++ b/source/row.h @@ -30,7 +30,7 @@ extern "C" { #define LIBYUV_SSSE3_ONLY #endif -// The following are available on all x86 platforms +// The following are available on all x86 platforms: #if !defined(YUV_DISABLE_ASM) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) #define HAS_ABGRTOARGBROW_SSSE3 @@ -55,10 +55,11 @@ extern "C" { #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 #define HAS_I400TOARGBROW_SSE2 -#define HAS_I420TOABGRROW_SSSE3 -#define HAS_I420TOARGBROW_SSSE3 -#define HAS_I420TOBGRAROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 +#define HAS_I422TOBGRAROW_SSSE3 #define HAS_I444TOARGBROW_SSSE3 +#define HAS_I411TOARGBROW_SSSE3 #define HAS_MIRRORROW_SSSE3 #define 
HAS_MIRRORROWUV_SSSE3 #define HAS_ADDROW_SSE2 @@ -75,7 +76,7 @@ extern "C" { #define HAS_ARGBSEPIAROW_SSSE3 #endif -// The following are available only useful when SSSE3 is unavailable. +// The following are disabled when SSSE3 is available: #if !defined(YUV_DISABLE_ASM) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_SSSE3_ONLY) @@ -91,9 +92,9 @@ extern "C" { #define HAS_MIRRORROWUV_NEON #define HAS_SPLITUV_NEON #define HAS_COPYROW_NEON -#define HAS_I420TOARGBROW_NEON -#define HAS_I420TOBGRAROW_NEON -#define HAS_I420TOABGRROW_NEON +#define HAS_I422TOARGBROW_NEON +#define HAS_I422TOBGRAROW_NEON +#define HAS_I422TOABGRROW_NEON #endif #if defined(_MSC_VER) @@ -118,17 +119,17 @@ typedef uint32 __attribute__((vector_size(16))) uvec32; #define OMITFP __attribute__((optimize("omit-frame-pointer"))) #endif -void I420ToARGBRow_NEON(const uint8* y_buf, +void I422ToARGBRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToBGRARow_NEON(const uint8* y_buf, +void I422ToBGRARow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToABGRRow_NEON(const uint8* y_buf, +void I422ToABGRRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -219,19 +220,19 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); -void I420ToARGBRow_C(const uint8* y_buf, +void I422ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToBGRARow_C(const uint8* y_buf, +void I422ToBGRARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToABGRRow_C(const uint8* y_buf, +void I422ToABGRRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -243,54 +244,78 @@ void I444ToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); +void I411ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); -void I420ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I420ToBGRARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I420ToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* argb_buf, int width); +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + 
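The Any_ variants declared below are generated by the YANY macro in row_common.cc (further down in this patch). A sketch of what one expansion looks like for the 4:2:2 case, written out here for readability:

void I422ToARGBRow_Any_SSSE3(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  int n = width & ~7;  // largest multiple of 8 pixels
  I422ToARGBRow_Unaligned_SSSE3(y_buf, u_buf, v_buf, rgb_buf, n);
  I422ToARGBRow_C(y_buf + n,         // 0..7 leftover pixels done in C
                  u_buf + (n >> 1),  // UV_SHIFT is 1 for 4:2:2
                  v_buf + (n >> 1),  // (0 for 4:4:4, 2 for 4:1:1)
                  rgb_buf + n * 4, width & 7);
}
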
+void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + +void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + +void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + void YToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, + uint8* argb_buf, int width); // ARGB preattenuated alpha blend. @@ -310,24 +335,37 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); // 'Any' functions handle any size and alignment. -void I420ToARGBRow_Any_SSSE3(const uint8* y_buf, +void I444ToARGBRow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToBGRARow_Any_SSSE3(const uint8* y_buf, +void I422ToARGBRow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToABGRRow_Any_SSSE3(const uint8* y_buf, +void I411ToARGBRow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); +void I422ToBGRARow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void I422ToABGRRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + + void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -344,19 +382,19 @@ void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -void I420ToARGBRow_Any_NEON(const uint8* y_buf, +void I422ToARGBRow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToBGRARow_Any_NEON(const uint8* y_buf, +void I422ToBGRARow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); -void I420ToABGRRow_Any_NEON(const uint8* y_buf, +void I422ToABGRRow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, diff --git a/source/row_common.cc b/source/row_common.cc index 6acfbbdcc..4fe019ce7 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -359,7 +359,8 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf, (255u << ashift); } -void I420ToARGBRow_C(const uint8* y_buf, +// Also used for 420 +void I422ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -377,7 +378,7 @@ void I420ToARGBRow_C(const uint8* y_buf, } } -void I420ToBGRARow_C(const uint8* y_buf, +void I422ToBGRARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -395,7 +396,7 @@ void I420ToBGRARow_C(const uint8* y_buf, } } -void I420ToABGRRow_C(const uint8* y_buf, +void I422ToABGRRow_C(const uint8* 
y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -427,6 +428,32 @@ void I444ToARGBRow_C(const uint8* y_buf, } } +void I411ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 3; x += 4) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0); + YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0); + y_buf += 4; + u_buf += 1; + v_buf += 1; + rgb_buf += 16; // Advance 4 pixels. + } + if (width & 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} + void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) { for (int x = 0; x < width; ++x) { YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0); @@ -686,8 +713,8 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -// Wrappers to handle odd sizes/alignments -#define YUVANY(NAMEANY, I420TORGB_SSE, I420TORGB_C) \ +// Wrappers to handle odd width +#define YANY(NAMEANY, I420TORGB_SSE, I420TORGB_C, UV_SHIFT) \ void NAMEANY(const uint8* y_buf, \ const uint8* u_buf, \ const uint8* v_buf, \ uint8* rgb_buf, \ @@ -696,22 +723,24 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, int n = width & ~7; \ I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \ I420TORGB_C(y_buf + n, \ - u_buf + (n >> 1), \ - v_buf + (n >> 1), \ - rgb_buf + n * 4, width & 7); \ + u_buf + (n >> UV_SHIFT), \ + v_buf + (n >> UV_SHIFT), \ + rgb_buf + n * 4, width & 7); \ } -#if defined(HAS_I420TOARGBROW_SSSE3) -YUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_Unaligned_SSSE3, I420ToARGBRow_C) -YUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_Unaligned_SSSE3, I420ToBGRARow_C) -YUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_Unaligned_SSSE3, I420ToABGRRow_C) +#if defined(HAS_I422TOARGBROW_SSSE3) +YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0) +YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1) +YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2) +YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) +YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) #endif -#if defined(HAS_I420TOARGBROW_NEON) -YUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, I420ToARGBRow_C) -YUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, I420ToBGRARow_C) -YUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, I420ToABGRRow_C) +#if defined(HAS_I422TOARGBROW_NEON) +YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1) +YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1) +YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1) #endif -#undef YUVANY +#undef YANY #define RGBANY(NAMEANY, ARGBTORGB, BPP) \ void NAMEANY(const uint8* argb_buf, \ diff --git a/source/row_neon.cc b/source/row_neon.cc index 810306974..8f3500cf4 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -18,7 +18,7 @@ extern "C" { // This module is for GCC Neon #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) -#define YUVTORGB \ +#define YUV422TORGB \ "vld1.u8 {d0}, [%0]!
\n" \ "vld1.u32 {d2[0]}, [%1]! \n" \ "vld1.u32 {d2[1]}, [%2]! \n" \ @@ -46,17 +46,17 @@ extern "C" { "vtrn.u8 d22, d23 \n" \ "vtrn.u8 d16, d17 \n" \ -#if defined(HAS_I420TOARGBROW_NEON) || \ - defined(HAS_I420TOBGRAROW_NEON) || \ - defined(HAS_I420TOABGRROW_NEON) +#if defined(HAS_I422TOARGBROW_NEON) || \ + defined(HAS_I422TOBGRAROW_NEON) || \ + defined(HAS_I422TOABGRROW_NEON) static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, 0, 0, 0, 0, 0, 0, 0, 0 }; static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, 0, 0, 0, 0, 0, 0, 0, 0 }; #endif -#ifdef HAS_I420TOARGBROW_NEON -void I420ToARGBRow_NEON(const uint8* y_buf, +#ifdef HAS_I422TOARGBROW_NEON +void I422ToARGBRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -68,7 +68,7 @@ void I420ToARGBRow_NEON(const uint8* y_buf, "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" "1: \n" -YUVTORGB + YUV422TORGB "vmov.u8 d21, d16 \n" "vmov.u8 d23, #255 \n" "vst4.u8 {d20, d21, d22, d23}, [%3]! \n" @@ -85,10 +85,10 @@ YUVTORGB "q10", "q11", "q12", "q13", "q14", "q15" ); } -#endif +#endif // HAS_I422TOARGBROW_NEON -#ifdef HAS_I420TOBGRAROW_NEON -void I420ToBGRARow_NEON(const uint8* y_buf, +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -100,7 +100,7 @@ void I420ToBGRARow_NEON(const uint8* y_buf, "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" "1: \n" -YUVTORGB + YUV422TORGB "vswp.u8 d20, d22 \n" "vmov.u8 d21, d16 \n" "vmov.u8 d19, #255 \n" @@ -118,10 +118,10 @@ YUVTORGB "q10", "q11", "q12", "q13", "q14", "q15" ); } -#endif +#endif // HAS_I422TOBGRAROW_NEON -#ifdef HAS_I420TOABGRROW_NEON -void I420ToABGRRow_NEON(const uint8* y_buf, +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, @@ -133,7 +133,7 @@ void I420ToABGRRow_NEON(const uint8* y_buf, "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" "1: \n" -YUVTORGB + YUV422TORGB "vswp.u8 d20, d22 \n" "vmov.u8 d21, d16 \n" "vmov.u8 d23, #255 \n" @@ -151,7 +151,7 @@ YUVTORGB "q10", "q11", "q12", "q13", "q14", "q15" ); } -#endif +#endif // HAS_I422TOABGRROW_NEON #ifdef HAS_SPLITUV_NEON // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v @@ -172,7 +172,7 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { : "memory", "cc", "q0", "q1" // Clobber List ); } -#endif +#endif // HAS_SPLITUV_NEON #ifdef HAS_COPYROW_NEON // Copy multiple of 64 @@ -266,7 +266,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { : "memory", "cc", "r3", "q0" ); } -#endif +#endif // HAS_MIRRORROW_NEON #ifdef HAS_MIRRORROWUV_NEON void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { @@ -325,7 +325,7 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { : "memory", "cc", "r12", "q0" ); } -#endif +#endif // HAS_MIRRORROWUV_NEON #endif // __ARM_NEON__ diff --git a/source/row_posix.cc b/source/row_posix.cc index a51207de8..479ece0ac 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1215,7 +1215,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, #endif // HAS_ARGBTOYROW_SSSE3 -#ifdef HAS_I420TOARGBROW_SSSE3 +#ifdef HAS_I422TOARGBROW_SSSE3 #define UB 127 /* min(63,static_cast(2.018 * 64)) */ #define UG -25 /* static_cast(-0.391 * 64 - 0.5) */ #define UR 0 @@ -1251,8 +1251,37 @@ struct { { YG, YG, YG, YG, YG, YG, YG, YG } }; -// Convert 8 pixels -#define YUVTORGB \ 
+// Convert 8 pixels: 8 UV and 8 Y +#define YUV444TORGB \ + "movq (%1),%%xmm0 \n" \ + "movq (%1,%2,1),%%xmm1 \n" \ + "lea 0x8(%1),%1 \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw (%5),%%xmm0 \n" \ + "pmaddubsw 16(%5),%%xmm1 \n" \ + "pmaddubsw 32(%5),%%xmm2 \n" \ + "psubw 48(%5),%%xmm0 \n" \ + "psubw 64(%5),%%xmm1 \n" \ + "psubw 80(%5),%%xmm2 \n" \ + "movq (%0),%%xmm3 \n" \ + "lea 0x8(%0),%0 \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw 96(%5),%%xmm3 \n" \ + "pmullw 112(%5),%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +// Convert 8 pixels: 4 UV and 8 Y +#define YUV422TORGB \ "movd (%1),%%xmm0 \n" \ "movd (%1,%2,1),%%xmm1 \n" \ "lea 0x4(%1),%1 \n" \ @@ -1281,10 +1310,41 @@ struct { "packuswb %%xmm1,%%xmm1 \n" \ "packuswb %%xmm2,%%xmm2 \n" -void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, +// Convert 8 pixels: 2 UV and 8 Y +#define YUV411TORGB \ + "movd (%1),%%xmm0 \n" \ + "movd (%1,%2,1),%%xmm1 \n" \ + "lea 0x2(%1),%1 \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpckldq %%xmm0,%%xmm0 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw (%5),%%xmm0 \n" \ + "pmaddubsw 16(%5),%%xmm1 \n" \ + "pmaddubsw 32(%5),%%xmm2 \n" \ + "psubw 48(%5),%%xmm0 \n" \ + "psubw 64(%5),%%xmm1 \n" \ + "psubw 80(%5),%%xmm2 \n" \ + "movq (%0),%%xmm3 \n" \ + "lea 0x8(%0),%0 \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw 96(%5),%%xmm3 \n" \ + "pmullw 112(%5),%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* argb_buf, int width) { asm volatile ( "sub %1,%2 \n" @@ -1292,7 +1352,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUVTORGB + YUV444TORGB "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1306,7 +1366,7 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 + "+r"(argb_buf), // %3 "+rm"(width) // %4 : "r"(&kYuvConstants.kUVToB) // %5 : "memory", "cc" @@ -1316,10 +1376,10 @@ void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf, ); } -void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, +void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* argb_buf, int width) { asm volatile ( "sub %1,%2 \n" @@ -1327,7 +1387,182 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUVTORGB + YUV422TORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%3) \n" + "movdqa %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(argb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if 
defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + YUV411TORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%3) \n" + "movdqa %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(argb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + YUV444TORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(argb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + YUV422TORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(argb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + YUV411TORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(argb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + YUV422TORGB "pcmpeqb %%xmm5,%%xmm5 \n" "punpcklbw 
%%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm5 \n" @@ -1342,7 +1577,7 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 + "+r"(bgra_buf), // %3 "+rm"(width) // %4 : "r"(&kYuvConstants.kUVToB) // %5 : "memory", "cc" @@ -1352,10 +1587,10 @@ void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf, ); } -void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, +void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* abgr_buf, int width) { asm volatile ( "sub %1,%2 \n" @@ -1363,7 +1598,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUVTORGB + YUV422TORGB "punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm0 \n" "movdqa %%xmm2,%%xmm1 \n" @@ -1377,7 +1612,7 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 + "+r"(abgr_buf), // %3 "+rm"(width) // %4 : "r"(&kYuvConstants.kUVToB) // %5 : "memory", "cc" @@ -1387,10 +1622,10 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, ); } -void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, +void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* bgra_buf, int width) { asm volatile ( "sub %1,%2 \n" @@ -1398,42 +1633,7 @@ void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUVTORGB - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - asm volatile ( - "sub %1,%2 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" - "1: \n" - YUVTORGB + YUV422TORGB "pcmpeqb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm5 \n" @@ -1448,7 +1648,7 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 + "+r"(bgra_buf), // %3 "+rm"(width) // %4 : "r"(&kYuvConstants.kUVToB) // %5 : "memory", "cc" @@ -1458,10 +1658,10 @@ void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, ); } -void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, +void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* abgr_buf, int width) { asm volatile ( "sub %1,%2 \n" @@ -1469,7 +1669,7 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUVTORGB + YUV422TORGB "punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm0 \n" "movdqa %%xmm2,%%xmm1 \n" @@ -1483,7 +1683,7 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 + "+r"(abgr_buf), // %3 "+rm"(width) // %4 : "r"(&kYuvConstants.kUVToB) // %5 : "memory", "cc" @@ 
-1493,63 +1693,7 @@ void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ); } -void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - asm volatile ( - "sub %1,%2 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" - "1: \n" - "movd (%1),%%xmm0 \n" - "movd (%1,%2,1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pmaddubsw (%5),%%xmm0 \n" - "pmaddubsw 16(%5),%%xmm1 \n" - "pmaddubsw 32(%5),%%xmm2 \n" - "psubw 48(%5),%%xmm0 \n" - "psubw 64(%5),%%xmm1 \n" - "psubw 80(%5),%%xmm2 \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" - "punpcklbw %%xmm4,%%xmm3 \n" - "psubsw 96(%5),%%xmm3 \n" - "pmullw 112(%5),%%xmm3 \n" - "paddsw %%xmm3,%%xmm0 \n" - "paddsw %%xmm3,%%xmm1 \n" - "paddsw %%xmm3,%%xmm2 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm2 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm2,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "sub $0x4,%4 \n" - "movdqa %%xmm0,(%3) \n" - "lea 0x10(%3),%3 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} -#endif +#endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_YTOARGBROW_SSE2 void YToARGBRow_SSE2(const uint8* y_buf, diff --git a/source/row_win.cc b/source/row_win.cc index bb36d3802..60c1e6e31 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1200,7 +1200,7 @@ __asm { } } -#ifdef HAS_I420TOARGBROW_SSSE3 +#ifdef HAS_I422TOARGBROW_SSSE3 #define YG 74 /* static_cast(1.164 * 64 + 0.5) */ @@ -1235,7 +1235,42 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; -#define YUVTORGB __asm { \ +// TODO(fbarchard): NV12/NV21 fetch UV and use directly. 
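The three macro variants that follow differ only in Step 1 (how many UV bytes are fetched and how far each pair is fanned out); Steps 2 and 3 are identical. A scalar model of the fan-out, with FanOutUV and the staging arrays named here only for illustration:

// shift = 0 for 4:4:4, 1 for 4:2:2, 2 for 4:1:1; punpcklwd/punpckldq
// perform this duplication in the SIMD versions below.
static void FanOutUV(const uint8* u_buf, const uint8* v_buf, int shift,
                     uint8 u8[8], uint8 v8[8]) {
  for (int x = 0; x < 8; ++x) {
    u8[x] = u_buf[x >> shift];  // one U sample covers 1, 2 or 4 pixels
    v8[x] = v_buf[x >> shift];
  }
}
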
+ +// Convert 8 pixels: 8 UV and 8 Y +#define YUV444TORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movq xmm0, qword ptr [esi] /* U */ \ + __asm movq xmm1, qword ptr [esi + edi] /* V */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// Convert 8 pixels: 4 UV and 8 Y +#define YUV422TORGB __asm { \ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ __asm movd xmm0, [esi] /* U */ \ __asm movd xmm1, [esi + edi] /* V */ \ @@ -1267,11 +1302,47 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; __asm packuswb xmm2, xmm2 /* R */ \ } +// Convert 8 pixels: 2 UV and 8 Y +#define YUV411TORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ + __asm lea esi, [esi + 2] \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + +// 8 pixels, dest aligned 16. 
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes) __declspec(naked) __declspec(align(16)) -void I420ToARGBRow_SSSE3(const uint8* y_buf, +void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* argb_buf, int width) { __asm { push esi @@ -1279,7 +1350,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, mov eax, [esp + 8 + 4] // Y mov esi, [esp + 8 + 8] // U mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // rgb + mov edx, [esp + 8 + 16] // argb mov ecx, [esp + 8 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -1287,7 +1358,219 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf, align 16 convertloop: - YUVTORGB + YUV444TORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes) +__declspec(naked) __declspec(align(16)) +void I422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + YUV422TORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. +// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes) +// Similar to I420 but duplicate UV once more. +__declspec(naked) __declspec(align(16)) +void I411ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + YUV411TORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, unaligned. 
+
+// 8 pixels, unaligned.
+// 8 UV values, mixed with 8 Y, producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* argb_buf,
+                                   int width) {
+  __asm {
+    push esi
+    push edi
+    mov eax, [esp + 8 + 4]   // Y
+    mov esi, [esp + 8 + 8]   // U
+    mov edi, [esp + 8 + 12]  // V
+    mov edx, [esp + 8 + 16]  // argb
+    mov ecx, [esp + 8 + 20]  // width
+    sub edi, esi
+    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha
+    pxor xmm4, xmm4
+
+    align 16
+ convertloop:
+    YUV444TORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw xmm0, xmm1 // BG
+    punpcklbw xmm2, xmm5 // RA
+    movdqa xmm1, xmm0
+    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
+    lea edx, [edx + 32]
+    sub ecx, 8
+    jg convertloop
+
+    pop edi
+    pop esi
+    ret
+  }
+}
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y, producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* argb_buf,
+                                   int width) {
+  __asm {
+    push esi
+    push edi
+    mov eax, [esp + 8 + 4]   // Y
+    mov esi, [esp + 8 + 8]   // U
+    mov edi, [esp + 8 + 12]  // V
+    mov edx, [esp + 8 + 16]  // argb
+    mov ecx, [esp + 8 + 20]  // width
+    sub edi, esi
+    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha
+    pxor xmm4, xmm4
+
+    align 16
+ convertloop:
+    YUV422TORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw xmm0, xmm1 // BG
+    punpcklbw xmm2, xmm5 // RA
+    movdqa xmm1, xmm0
+    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+    movdqu [edx], xmm0
+    movdqu [edx + 16], xmm1
+    lea edx, [edx + 32]
+    sub ecx, 8
+    jg convertloop
+
+    pop edi
+    pop esi
+    ret
+  }
+}
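+
+// The _Unaligned_ variants above match the aligned kernels except for the
+// destination store: movdqu accepts any address, while the movdqa used by
+// the aligned versions faults unless [edx] is 16-byte aligned. The
+// equivalent check in C++, as an illustrative sketch (IsAligned16Sketch is
+// not a library function):
+#if 0
+static __inline bool IsAligned16Sketch(const void* p) {
+  return (reinterpret_cast<uintptr_t>(p) & 15) == 0;
+}
+#endif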
+
+// 8 pixels, unaligned.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y, producing 8 ARGB (32 bytes).
+// Similar to I422 but duplicates UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* argb_buf,
+                                   int width) {
+  __asm {
+    push esi
+    push edi
+    mov eax, [esp + 8 + 4]   // Y
+    mov esi, [esp + 8 + 8]   // U
+    mov edi, [esp + 8 + 12]  // V
+    mov edx, [esp + 8 + 16]  // argb
+    mov ecx, [esp + 8 + 20]  // width
+    sub edi, esi
+    pcmpeqb xmm5, xmm5       // generate 0xffffffff for alpha
+    pxor xmm4, xmm4
+
+    align 16
+ convertloop:
+    YUV411TORGB
 
     // Step 3: Weave into ARGB
     punpcklbw xmm0, xmm1 // BG
@@ -1308,10 +1591,10 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
 }
 
 __declspec(naked) __declspec(align(16))
-void I420ToBGRARow_SSSE3(const uint8* y_buf,
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
-                         uint8* rgb_buf,
+                         uint8* bgra_buf,
                          int width) {
   __asm {
     push esi
@@ -1319,14 +1602,14 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
     mov eax, [esp + 8 + 4]   // Y
     mov esi, [esp + 8 + 8]   // U
     mov edi, [esp + 8 + 12]  // V
-    mov edx, [esp + 8 + 16]  // rgb
+    mov edx, [esp + 8 + 16]  // bgra
     mov ecx, [esp + 8 + 20]  // width
     sub edi, esi
     pxor xmm4, xmm4
 
     align 16
 convertloop:
-    YUVTORGB
+    YUV422TORGB
 
     // Step 3: Weave into BGRA
     pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
@@ -1348,10 +1631,10 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
 }
 
 __declspec(naked) __declspec(align(16))
-void I420ToABGRRow_SSSE3(const uint8* y_buf,
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
-                         uint8* rgb_buf,
+                         uint8* abgr_buf,
                          int width) {
   __asm {
     push esi
@@ -1359,7 +1642,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
     mov eax, [esp + 8 + 4]   // Y
     mov esi, [esp + 8 + 8]   // U
     mov edi, [esp + 8 + 12]  // V
-    mov edx, [esp + 8 + 16]  // rgb
+    mov edx, [esp + 8 + 16]  // abgr
     mov ecx, [esp + 8 + 20]  // width
     sub edi, esi
     pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
@@ -1367,7 +1650,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
     align 16
 convertloop:
-    YUVTORGB
+    YUV422TORGB
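+    // ABGR wants R,G,B,A byte order in memory, so the weave below starts
+    // from xmm2 (R) rather than xmm0 (B) as the ARGB kernels do.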
 
     // Step 3: Weave into ARGB
     punpcklbw xmm2, xmm1 // RG
@@ -1388,10 +1671,10 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
 }
 
 __declspec(naked) __declspec(align(16))
-void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
-                                   uint8* rgb_buf,
+                                   uint8* bgra_buf,
                                    int width) {
   __asm {
     push esi
@@ -1399,54 +1682,14 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
     mov eax, [esp + 8 + 4]   // Y
     mov esi, [esp + 8 + 8]   // U
     mov edi, [esp + 8 + 12]  // V
-    mov edx, [esp + 8 + 16]  // rgb
-    mov ecx, [esp + 8 + 20]  // width
-    sub edi, esi
-    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-    pxor xmm4, xmm4
-
-    align 16
- convertloop:
-    YUVTORGB
-
-    // Step 3: Weave into ARGB
-    punpcklbw xmm0, xmm1 // BG
-    punpcklbw xmm2, xmm5 // RA
-    movdqa xmm1, xmm0
-    punpcklwd xmm0, xmm2 // BGRA first 4 pixels
-    punpckhwd xmm1, xmm2 // BGRA next 4 pixels
-    movdqu [edx], xmm0
-    movdqu [edx + 16], xmm1
-    lea edx, [edx + 32]
-    sub ecx, 8
-    jg convertloop
-
-    pop edi
-    pop esi
-    ret
-  }
-}
-
-__declspec(naked) __declspec(align(16))
-void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int width) {
-  __asm {
-    push esi
-    push edi
-    mov eax, [esp + 8 + 4]   // Y
-    mov esi, [esp + 8 + 8]   // U
-    mov edi, [esp + 8 + 12]  // V
-    mov edx, [esp + 8 + 16]  // rgb
+    mov edx, [esp + 8 + 16]  // bgra
     mov ecx, [esp + 8 + 20]  // width
     sub edi, esi
     pxor xmm4, xmm4
 
     align 16
 convertloop:
-    YUVTORGB
+    YUV422TORGB
 
     // Step 3: Weave into BGRA
     pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
@@ -1468,10 +1711,10 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
 }
 
 __declspec(naked) __declspec(align(16))
-void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
-                                   uint8* rgb_buf,
+                                   uint8* abgr_buf,
                                    int width) {
   __asm {
     push esi
@@ -1479,7 +1722,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
     mov eax, [esp + 8 + 4]   // Y
     mov esi, [esp + 8 + 8]   // U
     mov edi, [esp + 8 + 12]  // V
-    mov edx, [esp + 8 + 16]  // rgb
+    mov edx, [esp + 8 + 16]  // abgr
    mov ecx, [esp + 8 + 20]  // width
     sub edi, esi
     pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
@@ -1487,7 +1730,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
     align 16
 convertloop:
-    YUVTORGB
+    YUV422TORGB
 
     // Step 3: Weave into ARGB
     punpcklbw xmm2, xmm1 // RG
@@ -1506,72 +1749,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
     ret
   }
 }
-
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         int width) {
-  __asm {
-    push esi
-    push edi
-    mov eax, [esp + 8 + 4]   // Y
-    mov esi, [esp + 8 + 8]   // U
-    mov edi, [esp + 8 + 12]  // V
-    mov edx, [esp + 8 + 16]  // rgb
-    mov ecx, [esp + 8 + 20]  // width
-    sub edi, esi
-    pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
-    pxor xmm4, xmm4
-
-    align 16
- convertloop:
-    // Step 1: Find 4 UV contributions to 4 R,G,B values
-    movd xmm0, [esi] // U
-    movd xmm1, [esi + edi] // V
-    lea esi, [esi + 4]
-    punpcklbw xmm0, xmm1 // UV
-    movdqa xmm1, xmm0
-    movdqa xmm2, xmm0
-    pmaddubsw xmm0, kUVToB // scale B UV
-    pmaddubsw xmm1, kUVToG // scale G UV
-    pmaddubsw xmm2, kUVToR // scale R UV
-    psubw xmm0, kUVBiasB // unbias back to signed
-    psubw xmm1, kUVBiasG
-    psubw xmm2, kUVBiasR
-
-    // Step 2: Find Y contribution to 4 R,G,B values
-    movd xmm3, [eax]
-    lea eax, [eax + 4]
-    punpcklbw xmm3, xmm4
-    psubsw xmm3, kYSub16
-    pmullw xmm3, kYToRgb
-    paddsw xmm0, xmm3 // B += Y
-    paddsw xmm1, xmm3 // G += Y
-    paddsw xmm2, xmm3 // R += Y
-    psraw xmm0, 6
-    psraw xmm1, 6
-    psraw xmm2, 6
-    packuswb xmm0, xmm0 // B
-    packuswb xmm1, xmm1 // G
-    packuswb xmm2, xmm2 // R
-
-    // Step 3: Weave into ARGB
-    punpcklbw xmm0, xmm1 // BG
-    punpcklbw xmm2, xmm5 // RA
-    punpcklwd xmm0, xmm2 // BGRA 4 pixels
-    movdqa [edx], xmm0
-    lea edx, [edx + 16]
-    sub ecx, 4
-    jg convertloop
-
-    pop edi
-    pop esi
-    ret
-  }
-}
-#endif
+#endif  // HAS_I422TOARGBROW_SSSE3
 
 #ifdef HAS_YTOARGBROW_SSE2
 __declspec(naked) __declspec(align(16))
@@ -1617,7 +1795,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
     ret
   }
 }
-#endif
+#endif  // HAS_YTOARGBROW_SSE2
 #endif
 
 #ifdef HAS_MIRRORROW_SSSE3
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 1ad56adf1..1b053e3a7 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -25,88 +25,44 @@
 namespace libyuv {
 
-TEST_F(libyuvTest, BenchmarkI420ToARGB_C) {
-  align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
-  align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
-
-  MaskCpuFlags(kCpuInitialized);
-
-  for (int i = 0; i < benchmark_iterations_; ++i)
-    I420ToARGB(src_y, benchmark_width_,
-               src_u, benchmark_width_ >> 1,
-               src_v, benchmark_width_ >> 1,
-               dst_argb, benchmark_width_ << 2,
-               benchmark_width_, benchmark_height_);
-
-  MaskCpuFlags(-1);
-
-  EXPECT_EQ(0, 0);
-
-  free_aligned_buffer_16(src_y)
-  free_aligned_buffer_16(src_u)
-  free_aligned_buffer_16(src_v)
-  free_aligned_buffer_16(dst_argb)
-}
-
-TEST_F(libyuvTest, BenchmarkI420ToARGB_OPT) {
-  align_buffer_16(src_y, benchmark_width_ * benchmark_height_);
-  align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2);
-  align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_);
-
-  for (int i = 0; i < benchmark_iterations_; ++i)
-    I420ToARGB(src_y, benchmark_width_,
-               src_u, benchmark_width_ >> 1,
-               src_v, benchmark_width_ >> 1,
-               dst_argb, benchmark_width_ << 2,
-               benchmark_width_, benchmark_height_);
-
-  free_aligned_buffer_16(src_y)
-  free_aligned_buffer_16(src_u)
-  free_aligned_buffer_16(src_v)
-  free_aligned_buffer_16(dst_argb)
-}
-
-#define TESTI420TO(FMT, BPP) \
-TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \
+#define TESTPLANARTOB(FMT_A, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##_CvsOPT) { \
   const int src_width = 1280; \
   const int src_height = 720; \
   align_buffer_16(src_y, src_width * src_height); \
-  align_buffer_16(src_u, (src_width * src_height) >> 2); \
-  align_buffer_16(src_v, (src_width * src_height) >> 2); \
-  align_buffer_16(dst_rgb_c, (src_width * BPP) * src_height); \
-  align_buffer_16(dst_rgb_opt, (src_width * BPP) * src_height); \
+  align_buffer_16(src_u, src_width / SUBSAMP_X * src_height / SUBSAMP_Y); \
+  align_buffer_16(src_v, src_width / SUBSAMP_X * src_height / SUBSAMP_Y); \
+  align_buffer_16(dst_rgb_c, (src_width * BPP_B) * src_height); \
+  align_buffer_16(dst_rgb_opt, (src_width * BPP_B) * src_height); \
   srandom(time(NULL)); \
   for (int i = 0; i < src_height; ++i) \
     for (int j = 0; j < src_width; ++j) \
      src_y[(i * src_width) + j] = (random() & 0xff); \
-  for (int i = 0; i < src_height >> 1; ++i) \
-    for (int j = 0; j < src_width >> 1; ++j) { \
-      src_u[(i * src_width >> 1) + j] = (random() & 0xff); \
-      src_v[(i * src_width >> 1) + j] = (random() & 0xff); \
+  for (int i = 0; i < src_height / SUBSAMP_Y; ++i) \
+    for (int j = 0; j < src_width / SUBSAMP_X; ++j) { \
+      src_u[(i * src_width / SUBSAMP_X) + j] = (random() & 0xff); \
+      src_v[(i * src_width / SUBSAMP_X) + j] = (random() & 0xff); \
     } \
   MaskCpuFlags(kCpuInitialized); \
-  I420To##FMT(src_y, src_width, \
-              src_u, src_width >> 1, \
-              src_v, src_width >> 1, \
-              dst_rgb_c, src_width * BPP, \
+  FMT_A##To##FMT_B(src_y, src_width, \
+                   src_u, src_width / SUBSAMP_X, \
+                   src_v, src_width / SUBSAMP_X, \
+                   dst_rgb_c, src_width * BPP_B, \
                    src_width, src_height); \
   MaskCpuFlags(-1); \
   const int runs = 1000; \
   for (int i = 0; i < runs; ++i) { \
-    I420To##FMT(src_y, src_width, \
-                src_u, src_width >> 1, \
-                src_v, src_width >> 1, \
-                dst_rgb_opt, src_width * BPP, \
+    FMT_A##To##FMT_B(src_y, src_width, \
+                     src_u, src_width / SUBSAMP_X, \
+                     src_v, src_width / SUBSAMP_X, \
+                     dst_rgb_opt, src_width * BPP_B, \
                      src_width, src_height); \
   } \
   int err = 0; \
   for (int i = 0; i < src_height; ++i) { \
-    for (int j = 0; j < src_width * BPP; ++j) { \
-      int diff = static_cast<int>(dst_rgb_c[i * src_width * BPP + j]) - \
-                 static_cast<int>(dst_rgb_opt[i * src_width * BPP + j]); \
+    for (int j = 0; j < src_width * BPP_B; ++j) { \
+      int diff = static_cast<int>(dst_rgb_c[i * src_width * BPP_B + j]) - \
+                 static_cast<int>(dst_rgb_opt[i * src_width * BPP_B + j]); \
       if (abs(diff) > 2) \
         err++; \
     } \
@@ -119,14 +75,17 @@ TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \
   free_aligned_buffer_16(dst_rgb_opt) \
 }
 
-TESTI420TO(ARGB, 4)
-TESTI420TO(BGRA, 4)
-TESTI420TO(ABGR, 4)
-TESTI420TO(RAW, 3)
-TESTI420TO(RGB24, 3)
-TESTI420TO(RGB565, 2)
-TESTI420TO(ARGB1555, 2)
-TESTI420TO(ARGB4444, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4)
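+
+// Plane sizing follows from (SUBSAMP_X, SUBSAMP_Y): at 1280x720, the U and
+// V planes are 640x360 for I420 (2, 2), 640x720 for I422 (2, 1), 320x720
+// for I411 (4, 1) and the full 1280x720 for I444 (1, 1).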
 
 #define TESTATOB(FMT_A, BPP_A, FMT_B, BPP_B) \
 TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \