diff --git a/README.chromium b/README.chromium index 823109899..1e46d348c 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1839 +Version: 1840 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 46d371593..e1eb36b62 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -151,6 +151,17 @@ int MM21ToI420(const uint8_t* src_y, int width, int height); +// Convert MM21 to YUY2 +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); + // Convert I422 to NV21. LIBYUV_API int I422ToNV21(const uint8_t* src_y, diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index 2a488838a..ff2a581ac 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -209,10 +209,10 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -222,10 +222,10 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height); @@ -238,6 +238,41 @@ int ARGBToJ400(const uint8_t* src_argb, int width, int height); +// Convert ABGR to J420. (JPeg full range I420). +LIBYUV_API +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J422. +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height); + +// Convert ABGR to J400. (JPeg full range). +LIBYUV_API +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); + // Convert RGBA to J400. (JPeg full range). LIBYUV_API int RGBAToJ400(const uint8_t* src_rgba, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 1ef2256bf..f7c6db804 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -105,6 +105,17 @@ void DetileSplitUVPlane(const uint8_t* src_uv, int height, int tile_height); +// Convert a Y and UV plane of tiles into interlaced YUY2. +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height); + // Split interleaved UV plane into separate U and V planes. 
LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 1a1cf4b63..7ef0000b3 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -282,12 +282,14 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) +#define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ABGRTOUVJROW_SSSE3 +#define HAS_ABGRTOYJROW_SSSE3 +#define HAS_AR64TOARGBROW_SSSE3 +#define HAS_ARGBTOAB64ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR64ROW_SSSE3 -#define HAS_ARGBTOAB64ROW_SSSE3 -#define HAS_AR64TOARGBROW_SSSE3 -#define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_DETILEROW_SSE2 @@ -298,12 +300,12 @@ extern "C" { #define HAS_I212TOAR30ROW_SSSE3 #define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 -#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I410TOAR30ROW_SSSE3 #define HAS_I410TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 -#define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 +#define HAS_MERGEXRGBROW_SSE2 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV21TOYUV24ROW_SSSE3 #define HAS_P210TOAR30ROW_SSSE3 @@ -340,26 +342,19 @@ extern "C" { #define HAS_ABGRTOUVROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #endif +#define HAS_AB64TOARGBROW_AVX2 +#define HAS_ABGRTOUVJROW_AVX2 +#define HAS_ABGRTOYJROW_AVX2 +#define HAS_AR64TOARGBROW_AVX2 +#define HAS_ARGBTOAB64ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTOAR64ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 -#define HAS_ARGBTOAR64ROW_AVX2 -#define HAS_ARGBTOAB64ROW_AVX2 -#define HAS_AR64TOARGBROW_AVX2 -#define HAS_AB64TOARGBROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 -#define HAS_INTERPOLATEROW_16TO8_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 -#define HAS_MERGEAR64ROW_AVX2 -#define HAS_MERGEARGB16TO8ROW_AVX2 -#define HAS_MERGEARGBROW_AVX2 -#define HAS_MERGEXR30ROW_AVX2 -#define HAS_MERGEXR64ROW_AVX2 -#define HAS_MERGEXRGB16TO8ROW_AVX2 -#define HAS_MERGEXRGBROW_AVX2 -#define HAS_NV21TOYUV24ROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I212TOAR30ROW_AVX2 @@ -367,16 +362,25 @@ extern "C" { #define HAS_I400TOARGBROW_AVX2 #define HAS_I410TOAR30ROW_AVX2 #define HAS_I410TOARGBROW_AVX2 +#define HAS_I422TOAR30ROW_AVX2 +#define HAS_I422TOUYVYROW_AVX2 +#define HAS_I422TOYUY2ROW_AVX2 +#define HAS_INTERPOLATEROW_16TO8_AVX2 +#define HAS_MERGEAR64ROW_AVX2 +#define HAS_MERGEARGB16TO8ROW_AVX2 +#define HAS_MERGEARGBROW_AVX2 +#define HAS_MERGEUVROW_16_AVX2 +#define HAS_MERGEXR30ROW_AVX2 +#define HAS_MERGEXR64ROW_AVX2 +#define HAS_MERGEXRGB16TO8ROW_AVX2 +#define HAS_MERGEXRGBROW_AVX2 +#define HAS_MIRRORUVROW_AVX2 +#define HAS_MULTIPLYROW_16_AVX2 +#define HAS_NV21TOYUV24ROW_AVX2 #define HAS_P210TOAR30ROW_AVX2 #define HAS_P210TOARGBROW_AVX2 #define HAS_P410TOAR30ROW_AVX2 #define HAS_P410TOARGBROW_AVX2 -#define HAS_I422TOAR30ROW_AVX2 -#define HAS_I422TOUYVYROW_AVX2 -#define HAS_I422TOYUY2ROW_AVX2 -#define HAS_MERGEUVROW_16_AVX2 -#define HAS_MIRRORUVROW_AVX2 -#define HAS_MULTIPLYROW_16_AVX2 #if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_AVX2 #endif @@ -433,8 +437,10 @@ extern "C" { #define HAS_ARGBTORGB565ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON +#define HAS_ABGRTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON 
#define HAS_ARGBTOYJROW_NEON +#define HAS_ABGRTOYJROW_NEON #define HAS_ARGBTOYROW_NEON #define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON @@ -446,6 +452,7 @@ extern "C" { #define HAS_COPYROW_NEON #define HAS_DETILEROW_NEON #define HAS_DETILESPLITUVROW_NEON +#define HAS_DETILETOYUY2_NEON #define HAS_DIVIDEROW_16_NEON #define HAS_HALFFLOATROW_NEON #define HAS_HALFMERGEUVROW_NEON @@ -574,6 +581,7 @@ extern "C" { #define HAS_ARGBTORGB565ROW_MSA #define HAS_ARGBTOUV444ROW_MSA #define HAS_ARGBTOUVJROW_MSA +#define HAS_ABGRTOUVJROW_MSA #define HAS_ARGBTOUVROW_MSA #define HAS_ARGBTOYJROW_MSA #define HAS_ARGBTOYROW_MSA @@ -1148,9 +1156,13 @@ void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1164,8 +1176,9 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); @@ -1203,6 +1216,11 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width); void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1258,6 +1276,11 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_MSA(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1396,6 +1419,7 @@ void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void 
ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); @@ -1409,6 +1433,7 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1423,6 +1448,7 @@ void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1485,6 +1511,11 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1495,6 +1526,11 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, @@ -1525,6 +1561,11 @@ void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1535,6 +1576,11 @@ void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1582,6 +1628,11 @@ void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, @@ -1747,16 +1798,16 @@ void ARGBToUVJRow_C(const uint8_t* src_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVJRow_C(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_C(const uint8_t* src_rgb, 
int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb, - int src_stride_rgb, - uint8_t* dst_u, - uint8_t* dst_v, - int width); void BGRAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -1991,6 +2042,24 @@ void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -2604,8 +2673,8 @@ void Convert16To8Row_NEON(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); -void Convert16To8Row_Any_NEON(const uint16_t* src_y, - uint8_t* dst_y, +void Convert16To8Row_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, int scale, int width); @@ -3823,13 +3892,13 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_SSSE3(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); -void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 18c4416dc..02b66d4ee 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1839 +#define LIBYUV_VERSION 1840 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 7178580f3..37066721e 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -262,10 +262,10 @@ int I210ToI420(const uint16_t* src_y, height); ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, dst_stride_u, src_u, dst_u, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); + /*wpp=*/1, scale, kFilterBilinear); ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, dst_stride_v, src_v, dst_v, 0, 32768, dy, - /*bpp=*/1, scale, kFilterBilinear); + /*wpp=*/1, scale, kFilterBilinear); } return 0; } @@ -713,6 +713,25 @@ int MM21ToI420(const uint8_t* src_y, return 0; } +LIBYUV_API +int MM21ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { + if (!src_y || !src_uv || !dst_yuy2 || width <= 0) { + return -1; + } + + DetileToYUY2(src_y, src_stride_y, src_uv, src_stride_uv, dst_yuy2, + dst_stride_yuy2, width, height, 32); + + return 0; +} + #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. 
// 422 chroma is 1/2 width, 1x height diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index e50c2af38..2f38a4882 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1858,19 +1858,19 @@ int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1879,6 +1879,22 @@ int ARGBToJ420(const uint8_t* src_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } +#if defined(HAS_ARGBTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYJRow = ARGBToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_NEON; + } + } +#endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -1903,19 +1919,11 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYJRow = ARGBToYJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUVJROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVJRow = ARGBToUVJRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVJRow = ARGBToUVJRow_NEON; +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; } } #endif @@ -1931,16 +1939,6 @@ int ARGBToJ420(const uint8_t* src_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_LSX) && defined(HAS_ARGBTOUVJROW_LSX) - if (TestCpuFlag(kCpuHasLSX)) { - ARGBToYJRow = ARGBToYJRow_Any_LSX; - ARGBToUVJRow = ARGBToUVJRow_Any_LSX; - if (IS_ALIGNED(width, 16)) { - ARGBToYJRow = ARGBToYJRow_LSX; - ARGBToUVJRow = ARGBToUVJRow_LSX; - } - } -#endif #if defined(HAS_ARGBTOYJROW_LASX) && defined(HAS_ARGBTOUVJROW_LASX) if (TestCpuFlag(kCpuHasLASX)) { ARGBToYJRow = ARGBToYJRow_Any_LASX; @@ -1953,16 +1951,16 @@ int ARGBToJ420(const uint8_t* src_argb, #endif for (y = 0; y < height - 1; y += 2) { - ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, src_stride_argb, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); src_argb += src_stride_argb * 2; dst_yj += dst_stride_yj * 2; - dst_u += dst_stride_u; - dst_v += dst_stride_v; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } if (height & 1) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); } return 0; @@ -1974,19 +1972,19 @@ int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, 
uint8_t* dst_yj, int dst_stride_yj, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1997,10 +1995,10 @@ int ARGBToJ422(const uint8_t* src_argb, } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_yj == width && - dst_stride_u * 2 == width && dst_stride_v * 2 == width) { + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { width *= height; height = 1; - src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; + src_stride_argb = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; } #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -2026,6 +2024,14 @@ int ARGBToJ422(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; @@ -2076,130 +2082,12 @@ int ARGBToJ422(const uint8_t* src_argb, #endif for (y = 0; y < height; ++y) { - ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, 0, dst_uj, dst_vj, width); ARGBToYJRow(src_argb, dst_yj, width); src_argb += src_stride_argb; dst_yj += dst_stride_yj; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// Convert ARGB to AR64. -LIBYUV_API -int ARGBToAR64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ar64, - int dst_stride_ar64, - int width, - int height) { - int y; - void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAR64Row_C; - if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ar64 = 0; - } -#if defined(HAS_ARGBTOAR64ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR64Row = ARGBToAR64Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAR64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOAR64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAR64Row = ARGBToAR64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAR64Row = ARGBToAR64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToAR64Row(src_argb, dst_ar64, width); - src_argb += src_stride_argb; - dst_ar64 += dst_stride_ar64; - } - return 0; -} - -// Convert ARGB to AB64. 
-LIBYUV_API -int ARGBToAB64(const uint8_t* src_argb, - int src_stride_argb, - uint16_t* dst_ab64, - int dst_stride_ab64, - int width, - int height) { - int y; - void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, - int width) = ARGBToAB64Row_C; - if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. - if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { - width *= height; - height = 1; - src_stride_argb = dst_stride_ab64 = 0; - } -#if defined(HAS_ARGBTOAB64ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAB64Row = ARGBToAB64Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAB64ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOAB64ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToAB64Row = ARGBToAB64Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToAB64Row = ARGBToAB64Row_NEON; - } - } -#endif - - for (y = 0; y < height; ++y) { - ARGBToAB64Row(src_argb, dst_ab64, width); - src_argb += src_stride_argb; - dst_ab64 += dst_stride_ab64; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; } return 0; } @@ -2336,6 +2224,428 @@ int RGBAToJ400(const uint8_t* src_rgba, return 0; } +// Convert ABGR to J420. (JPeg full range I420). +LIBYUV_API +int ABGRToJ420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow = ABGRToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) && defined(HAS_ABGRTOUVJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + ABGRToUVJRow = ABGRToUVJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + ABGRToUVJRow = ABGRToUVJRow_LSX; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVJRow(src_abgr, src_stride_abgr, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + ABGRToYJRow(src_abgr + src_stride_abgr, dst_yj + dst_stride_yj, width); + src_abgr += src_stride_abgr * 2; + dst_yj += dst_stride_yj * 2; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + if (height & 1) { + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + } + return 0; +} + +// Convert ABGR to J422. (JPeg full range I422). +LIBYUV_API +int ABGRToJ422(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_uj, + int dst_stride_uj, + uint8_t* dst_vj, + int dst_stride_vj, + int width, + int height) { + int y; + void (*ABGRToUVJRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_uj, uint8_t* dst_vj, int width) = + ABGRToUVJRow_C; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || !dst_uj || !dst_vj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. 
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width && + dst_stride_uj * 2 == width && dst_stride_vj * 2 == width) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_yj = dst_stride_uj = dst_stride_vj = 0; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVJRow = ABGRToUVJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVJRow = ABGRToUVJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVJRow = ABGRToUVJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVJRow = ABGRToUVJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) && defined(HAS_ABGRTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + ABGRToUVJRow = ABGRToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + } + if (IS_ALIGNED(width, 32)) { + ABGRToUVJRow = ABGRToUVJRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LSX) && defined(HAS_ABGRTOUVJROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ABGRToYJRow = ABGRToYJRow_Any_LSX; + ABGRToUVJRow = ABGRToUVJRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_LSX; + ABGRToUVJRow = ABGRToUVJRow_LSX; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_LASX) && defined(HAS_ABGRTOUVJROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ABGRToYJRow = ABGRToYJRow_Any_LASX; + ABGRToUVJRow = ABGRToUVJRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_LASX; + ABGRToUVJRow = ABGRToUVJRow_LASX; + } + } +#endif + + for (y = 0; y < height; ++y) { + ABGRToUVJRow(src_abgr, 0, dst_uj, dst_vj, width); + ABGRToYJRow(src_abgr, dst_yj, width); + src_abgr += src_stride_abgr; + dst_yj += dst_stride_yj; + dst_uj += dst_stride_uj; + dst_vj += dst_stride_vj; + } + return 0; +} + +// Convert ABGR to J400. +LIBYUV_API +int ABGRToJ400(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; + void (*ABGRToYJRow)(const uint8_t* src_abgr, uint8_t* dst_yj, int width) = + ABGRToYJRow_C; + if (!src_abgr || !dst_yj || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. 
+ if (src_stride_abgr == width * 4 && dst_stride_yj == width) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_yj = 0; + } +#if defined(HAS_ABGRTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToYJRow = ABGRToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToYJRow = ABGRToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToYJRow = ABGRToYJRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYJRow = ABGRToYJRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYJRow = ABGRToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYJRow = ABGRToYJRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + ABGRToYJRow(src_abgr, dst_yj, width); + src_abgr += src_stride_abgr; + dst_yj += dst_stride_yj; + } + return 0; +} + +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Enabled if 1 pass is available #if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) #define HAS_RAWTOYJROW @@ -2355,7 +2665,7 @@ int RAWToJNV21(const uint8_t* src_raw, int halfwidth = (width + 1) >> 1; #if defined(HAS_RAWTOYJROW) void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = RAWToUVJRow_C; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYJRow_C; @@ -2363,12 +2673,12 @@ int RAWToJNV21(const uint8_t* src_raw, void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_uj, uint8_t* dst_vj, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYJRow_C; #endif - void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + void (*MergeUVRow_)(const uint8_t* src_uj, const uint8_t* src_vj, uint8_t* dst_vu, int width) = MergeUVRow_C; if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; @@ -2490,8 +2800,8 @@ int RAWToJNV21(const uint8_t* src_raw, #endif { // Allocate a row of uv. - align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + align_buffer_64(row_uj, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_vj = row_uj + ((halfwidth + 31) & ~31); #if !defined(HAS_RAWTOYJROW) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; @@ -2500,15 +2810,15 @@ int RAWToJNV21(const uint8_t* src_raw, for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, src_stride_raw, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); - ARGBToUVJRow(row, kRowSize, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToUVJRow(row, kRowSize, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); #endif @@ -2518,20 +2828,20 @@ int RAWToJNV21(const uint8_t* src_raw, } if (height & 1) { #if defined(HAS_RAWTOYJROW) - RAWToUVJRow(src_raw, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + RAWToUVJRow(src_raw, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); RAWToYJRow(src_raw, dst_y, width); #else RAWToARGBRow(src_raw, row, width); - ARGBToUVJRow(row, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_vu, halfwidth); + ARGBToUVJRow(row, 0, row_uj, row_vj, width); + MergeUVRow_(row_vj, row_uj, dst_vu, halfwidth); ARGBToYJRow(row, dst_y, width); #endif } #if !defined(HAS_RAWTOYJROW) free_aligned_buffer_64(row); #endif - free_aligned_buffer_64(row_u); + free_aligned_buffer_64(row_uj); } return 0; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 169d4a8fa..fae8630e9 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -915,7 +915,7 @@ int NV21ToNV12(const uint8_t* src_y, // tile width is 16 and assumed. // tile_height is 16 or 32 for MM21. // src_stride_y is bytes per row of source ignoring tiling. e.g. 640 -// TODO: More detile row functions. +// TODO(fbarchard): More detile row functions. LIBYUV_API void DetilePlane(const uint8_t* src_y, @@ -1033,6 +1033,66 @@ void DetileSplitUVPlane(const uint8_t* src_uv, } } +LIBYUV_API +void DetileToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height, + int tile_height) { + const ptrdiff_t src_y_tile_stride = 16 * tile_height; + const ptrdiff_t src_uv_tile_stride = src_y_tile_stride / 2; + int y; + void (*DetileToYUY2)(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, int width) = DetileToYUY2_C; + assert(src_stride_y >= 0); + assert(src_stride_y > 0); + assert(src_stride_uv >= 0); + assert(src_stride_uv > 0); + assert(tile_height > 0); + + if (width <= 0 || height == 0 || tile_height <= 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; + dst_stride_yuy2 = -dst_stride_yuy2; + } + +#if defined(HAS_DETILETOYUY2_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + DetileToYUY2 = DetileToYUY2_Any_NEON; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_NEON; + } + } +#endif + + // Detile plane + for (y = 0; y < height; ++y) { + DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, + dst_yuy2, width); + dst_yuy2 += dst_stride_yuy2; + src_y += 16; + + if (y & 0x1) { + src_uv += 16; + } + + // Advance to next row of tiles. + if ((y & (tile_height - 1)) == (tile_height - 1)) { + src_y = src_y - src_y_tile_stride + src_stride_y * tile_height; + src_uv = src_uv - src_uv_tile_stride + src_stride_uv * (tile_height / 2); + } + } +} + // Support function for NV12 etc RGB channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API diff --git a/source/row_any.cc b/source/row_any.cc index 3781a9f26..5270e86cd 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -959,6 +959,9 @@ ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) #ifdef HAS_ARGBTOYJROW_AVX2 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif +#ifdef HAS_ABGRTOYJROW_AVX2 +ANY11(ABGRToYJRow_Any_AVX2, ABGRToYJRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_RGBATOYJROW_AVX2 ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31) #endif @@ -983,6 +986,9 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #ifdef HAS_ARGBTOYJROW_SSSE3 ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_SSSE3 +ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_SSSE3 ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15) #endif @@ -998,6 +1004,9 @@ ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31) #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ABGRTOYJROW_NEON +ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15) +#endif #ifdef HAS_RGBATOYJROW_NEON ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) #endif @@ -2013,9 +2022,17 @@ ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVJROW_AVX2 +ANY12S(ABGRToUVJRow_Any_AVX2, ABGRToUVJRow_AVX2, 0, 4, 31) +#endif +#ifdef HAS_ARGBTOUVJROW_SSSE3 +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +#endif +#ifdef HAS_ABGRTOUVJROW_SSSE3 +ANY12S(ABGRToUVJRow_Any_SSSE3, ABGRToUVJRow_SSSE3, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVROW_SSSE3 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) -ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) @@ -2040,6 +2057,9 @@ ANY12S(ARGBToUVRow_Any_LASX, ARGBToUVRow_LASX, 0, 4, 31) #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVJROW_NEON +ANY12S(ABGRToUVJRow_Any_NEON, ABGRToUVJRow_NEON, 0, 4, 15) +#endif #ifdef HAS_ARGBTOUVJROW_MSA ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) #endif @@ -2229,6 +2249,29 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15) ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) #endif +#define ANYDETILEMERGE(NAMEANY, ANY_SIMD, MASK) \ + 
void NAMEANY(const uint8_t* src_y, ptrdiff_t src_y_tile_stride, \ + const uint8_t* src_uv, ptrdiff_t src_uv_tile_stride, \ + uint8_t* dst_yuy2, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 4]); \ + memset(temp, 0, 16 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, dst_yuy2, \ + n); \ + } \ + memcpy(temp, src_y + (n / 16) * src_y_tile_stride, r); \ + memcpy(temp + 16, src_uv + (n / 16) * src_uv_tile_stride, r); \ + ANY_SIMD(temp, src_y_tile_stride, temp + 16, src_uv_tile_stride, \ + temp + 32, r); \ + memcpy(dst_yuy2 + 2 * n, temp + 32, 2 * r); \ + } + +#ifdef HAS_DETILETOYUY2_NEON +ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 3f5949f9b..9d94ab289 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -798,6 +798,7 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { #endif MAKEROWYJ(ARGB, 2, 1, 0, 4) +MAKEROWYJ(ABGR, 0, 1, 2, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) MAKEROWYJ(RGB24, 2, 1, 0, 3) MAKEROWYJ(RAW, 0, 1, 2, 3) @@ -2747,6 +2748,27 @@ void DetileSplitUVRow_C(const uint8_t* src_uv, } } +void DetileToYUY2_C(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + for (int x = 0; x < width - 15; x += 16) { + for (int i = 0; i < 8; i++) { + dst_yuy2[0] = src_y[0]; + dst_yuy2[1] = src_uv[0]; + dst_yuy2[2] = src_y[1]; + dst_yuy2[3] = src_uv[1]; + dst_yuy2 += 4; + src_y += 2; + src_uv += 2; + } + src_y += src_y_tile_stride - 16; + src_uv += src_uv_tile_stride - 16; + } +} + void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index dce8c4392..88766785d 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -27,6 +27,9 @@ static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u, static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; +static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u, + 77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u}; + static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) @@ -39,12 +42,18 @@ static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0}; +static const vec8 kABGRToUJ = {-43, -84, 127, 0, -43, -84, 127, 0, + -43, -84, 127, 0, -43, -84, 127, 0}; + static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0}; static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0}; +static const vec8 kABGRToVJ = {127, -107, -20, 0, 127, -107, -20, 0, + 127, -107, -20, 0, 127, -107, -20, 0}; + // Constants for BGRA static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u}; @@ -1398,6 +1407,24 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_SSSE3 +#ifdef HAS_ABGRTOYJROW_SSSE3 +// Convert 16 ABGR pixels (64 bytes) to 16 YJ values. +// Same as ABGRToYRow but different coefficients, no add 16. 
+void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN RGBTOY(xmm5) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ABGRTOYJROW_SSSE3 + #ifdef HAS_RGBATOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. @@ -1416,7 +1443,8 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { } #endif // HAS_RGBATOYJROW_SSSE3 -#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \ + defined(HAS_ARGBEXTRACTALPHAROW_AVX2) // vpermd for vphaddw + vpackuswb vpermd. static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; #endif @@ -1486,6 +1514,26 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 +#ifdef HAS_ABGRTOYJROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN RGBTOY_AVX2(ymm5) + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToYJ), // %3 + "m"(kSub128), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOYJROW_AVX2 + #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { @@ -1571,11 +1619,15 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVROW_SSSE3 -#ifdef HAS_ARGBTOUVROW_AVX2 +#if defined(HAS_ARGBTOUVROW_AVX2) || defined(HAS_ABGRTOUVROW_AVX2) || \ + defined(HAS_ARGBTOUVJROW_AVX2) || defined(HAS_ABGRTOUVJROW_AVX2) // vpshufb for vphaddw + vpackuswb packed to shorts. 
static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +#endif + +#if defined(HAS_ARGBTOUVROW_AVX2) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1765,6 +1817,71 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 +// TODO(fbarchard): Pass kABGRToVJ / kABGRToUJ as matrix +#ifdef HAS_ABGRTOUVJROW_AVX2 +void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kSub128), // %5 + "m"(kABGRToVJ), // %6 + "m"(kABGRToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, @@ -1831,6 +1948,72 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUVJROW_SSSE3 +#ifdef HAS_ABGRTOUVJROW_SSSE3 +void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw 
%%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToVJ), // %5 + "m"(kABGRToUJ), // %6 + "m"(kSub128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); +} +#endif // HAS_ABGRTOUVJROW_SSSE3 + #ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, diff --git a/source/row_neon.cc b/source/row_neon.cc index 804ff8395..82039e9f8 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -622,6 +622,61 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, ); } +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 q0, [%0], %4 \n" // Load 16 Y + "pld [%0, 1792] \n" + "vld1.8 q1, [%1], %5 \n" // Load 8 UV + "pld [%1, 1792] \n" + "subs %3, %3, #16 \n" + "vst2.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUYV. +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld1.8 q0, [%0], %4 \n" // Load 16 Y + "vld1.8 q1, [%1], %5 \n" // Load 8 UV + "subs %3, %3, #16 \n" + "pld [%0, 1792] \n" + "vzip.8 q0, q1 \n" + "pld [%1, 1792] \n" + "vst1.8 {q0, q1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber list + ); +} +#endif + // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, @@ -1762,7 +1817,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } -// TODO(fbarchard): Subsample match C code. +// TODO(fbarchard): Subsample match Intel code. void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -1808,6 +1863,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. 
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + + "vrshr.u16 q0, q0, #1 \n" // 2x average + "vrshr.u16 q1, q1, #1 \n" + "vrshr.u16 q2, q2, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + // TODO(fbarchard): Subsample match C code. void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, @@ -2567,6 +2667,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 0f120373f..e166ce048 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -650,6 +650,62 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, ); } +#if LIBYUV_USE_ST2 +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "prfm pldl1keep, [%0, 1792] \n" + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "prfm pldl1keep, [%1, 1792] \n" + "subs %w3, %w3, #16 \n" // store 8 YUY2 + "st2 {v0.16b,v1.16b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1" // Clobber list + ); +} +#else +// Read 16 Y, 8 UV, and write 8 YUY2 +void DetileToYUY2_NEON(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys + "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs + "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%0, 1792] \n" + "zip1 v2.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%1, 1792] \n" + "zip2 v3.16b, v0.16b, v1.16b \n" + "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2 + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list + ); +} +#endif + #if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, @@ -2144,6 +2200,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, ); } +// TODO(fbarchard): Subsample match Intel code. 
void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2189,6 +2246,51 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, ); } +void ABGRToUVJRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_uj, + uint8_t* dst_vj, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; + asm volatile ( + "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 + "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 + "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 + "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 + "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 + "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + + "urshr v0.8h, v0.8h, #1 \n" // 2x average + "urshr v1.8h, v1.8h, #1 \n" + "urshr v2.8h, v2.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + RGBTOUV(v2.8h, v1.8h, v0.8h) + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_abgr_1), // %1 + "+r"(dst_uj), // %2 + "+r"(dst_vj), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} + void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, @@ -2812,6 +2914,10 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); } +void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); +} + // RGBA expects first value to be A and ignored, then 3 values to contain RGB. // Same code as ARGB, except the LD4 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, diff --git a/source/scale.cc b/source/scale.cc index e1335f1ee..4980f42d6 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -2039,7 +2039,7 @@ void ScalePlane_16(const uint16_t* src, } // Arbitrary scale vertically, but unscaled horizontally. 
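  // Note: ScalePlane_16 operates on uint16_t samples, so the component count
  // passed below is per pixel in 16-bit words (wpp) rather than bytes; a
  // single 16-bit plane is 1 word per pixel. (Reading wpp as words-per-pixel
  // is an inference from the argument comment, not stated elsewhere here.)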
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); + dst_stride, src, dst, 0, y, dy, /*wpp=*/1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index c248b5e98..ea4a99ac1 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1236,6 +1236,8 @@ TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1) TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2) TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1) +TESTATOPLANAR(ABGR, 4, 1, J420, 2, 2) +TESTATOPLANAR(ABGR, 4, 1, J422, 2, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2) TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2) @@ -1440,6 +1442,7 @@ TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) @@ -3042,6 +3045,51 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4) TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12) TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12) +TEST_F(LibYUVConvertTest, MM21ToYUY2) { + const int kWidth = (benchmark_width_ + 15) & (~15); + const int kHeight = (benchmark_height_ + 31) & (~31); + + align_buffer_page_end(orig_y, kWidth * kHeight); + align_buffer_page_end(orig_uv, + 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(tmp_y, kWidth * kHeight); + align_buffer_page_end(tmp_u, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + align_buffer_page_end(tmp_v, SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + align_buffer_page_end(dst_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + align_buffer_page_end(golden_yuyv, 4 * SUBSAMPLE(kWidth, 2) * kHeight); + + MemRandomize(orig_y, kWidth * kHeight); + MemRandomize(orig_uv, 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); + + /* Convert MM21 to YUY2 in 2 steps for reference */ + libyuv::MM21ToI420(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), tmp_y, + kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), kWidth, kHeight); + libyuv::I420ToYUY2(tmp_y, kWidth, tmp_u, SUBSAMPLE(kWidth, 2), tmp_v, + SUBSAMPLE(kWidth, 2), golden_yuyv, + 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + + /* Convert to NV12 */ + for (int i = 0; i < benchmark_iterations_; ++i) { + libyuv::MM21ToYUY2(orig_y, kWidth, orig_uv, 2 * SUBSAMPLE(kWidth, 2), + dst_yuyv, 4 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); + } + + for (int i = 0; i < 4 * SUBSAMPLE(kWidth, 2) * kHeight; ++i) { + EXPECT_EQ(dst_yuyv[i], golden_yuyv[i]); + } + + free_aligned_buffer_page_end(orig_y); + free_aligned_buffer_page_end(orig_uv); + free_aligned_buffer_page_end(tmp_y); + free_aligned_buffer_page_end(tmp_u); + free_aligned_buffer_page_end(tmp_v); + free_aligned_buffer_page_end(dst_yuyv); + free_aligned_buffer_page_end(golden_yuyv); +} + // Transitive test. A to B to C is same as A to C. // Benchmarks A To B to C for comparison to 1 step, benchmarked elsewhere. #define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \