From 5a9a6ea936085310f3b9fbd4a774868e6a984ec4 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 3 Feb 2025 14:44:55 -0800 Subject: [PATCH] Add RAWToI444 Skylake Xeon RAWToI444_Opt (433 ms) RAWToJ444_Opt (1781 ms) ARGBToI444_Opt (352 ms) ARGBToJ444_Opt (1577 ms) Samsung S22 Exynos ARGBToI444_Opt (283 ms) ARGBToJ444_Opt (209 ms) RAWToI444_Opt (294 ms) RAWToJ444_Opt (293 ms) Profiling on Samsung S22 Exynos 37.62%, ARGBToUV444Row_NEON_I8MM 29.42%, RAWToARGBRow_SVE2 19.61%, ARGBToYRow_NEON_DotProd Passing different --libyuv_cpu_info=N etc we can compare each ISA C 1 RAWToI444_Opt (781 ms) NEON 511 RAWToI444_Opt (757 ms) NEONDOT 1023 RAWToI444_Opt (571 ms) NEONI8MM 2047 RAWToI444_Opt (334 ms) SVE2 8191 RAWToI444_Opt (307 ms) Bug: 390247964 Change-Id: I0316fedd32222588455afa751f5b854f46bce024 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6223658 Reviewed-by: Wan-Teh Chang --- README.chromium | 2 +- include/libyuv/convert.h | 13 +++ include/libyuv/row.h | 1 - include/libyuv/version.h | 2 +- source/convert.cc | 214 +++++++++++++++++++++++++++++++++++++- unit_test/convert_test.cc | 1 + 6 files changed, 229 insertions(+), 4 deletions(-) diff --git a/README.chromium b/README.chromium index a0416e657..b44f26f62 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1902 +Version: 1903 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 9fc6d3476..750383aa3 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -924,6 +924,19 @@ int RAWToI420(const uint8_t* src_raw, int width, int height); +// RGB big endian (rgb in memory) to I444. +LIBYUV_API +int RAWToI444(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // RGB big endian (rgb in memory) to J420. LIBYUV_API int RAWToJ420(const uint8_t* src_raw, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d8fe2137d..9ee8af68f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -398,7 +398,6 @@ extern "C" { #define HAS_ARGBTOUVJ444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON -// TODO: Fix ARGBTOYROW and test ARGBToI444 tests pass. #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON #define HAS_AYUVTOUVROW_NEON diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e0026d606..84f35c4d0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1902 +#define LIBYUV_VERSION 1903 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 6bceb6d97..6c37143a9 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -3521,6 +3521,218 @@ int RAWToJ420(const uint8_t* src_raw, } #undef HAS_RAWTOYJROW +// RAW big endian (rgb in memory) to I444 +// 2 step conversion of RAWToARGB then ARGBToY and ARGBToUV444 +LIBYUV_API +int RAWToI444(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RAWToARGBRow_C; + void (*ARGBToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = + ARGBToYRow_C; + void (*ARGBToUV444Row)(const uint8_t* src_raw, uint8_t* dst_u, uint8_t* dst_v, + int width) = ARGBToUV444Row_C; + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_raw = src_raw + (height - 1) * src_stride_raw; + src_stride_raw = -src_stride_raw; + } + // TODO: add row coalesce when main loop handles large width in blocks + // TODO: implement UV444 or trim the ifdef below +#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM) + if (TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_LSX; + } + } +#endif +#if defined(HAS_ARGBTOUV444ROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToUV444Row = ARGBToUV444Row_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToUV444Row = ARGBToUV444Row_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYRow = ARGBToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYRow = ARGBToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON_DOTPROD) + if (TestCpuFlag(kCpuHasNeonDotProd)) { + ARGBToYRow = ARGBToYRow_Any_NEON_DotProd; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_NEON_DotProd; + } + } +#endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + ARGBToYRow = ARGBToYRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_LSX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + ARGBToYRow = ARGBToYRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_LASX; + } + } +#endif +#if defined(HAS_ARGBTOYROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToYRow = ARGBToYRow_RVV; + } +#endif + +#if defined(HAS_RAWTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RAWToARGBRow = RAWToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RAWToARGBRow = RAWToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RAWToARGBRow = RAWToARGBRow_NEON; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_SVE2) + if (TestCpuFlag(kCpuHasSVE2)) { + RAWToARGBRow = RAWToARGBRow_SVE2; + } +#endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + RAWToARGBRow = RAWToARGBRow_Any_LSX; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_LSX; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_LASX) + if (TestCpuFlag(kCpuHasLASX)) { + RAWToARGBRow = RAWToARGBRow_Any_LASX; + if (IS_ALIGNED(width, 32)) { + RAWToARGBRow = RAWToARGBRow_LASX; + } + } +#endif +#if defined(HAS_RAWTOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RAWToARGBRow = RAWToARGBRow_RVV; + } +#endif + + { + // Allocate a row of ARGB. + const int row_size = width * 4; + align_buffer_64(row, row_size); + if (!row) + return 1; + + for (y = 0; y < height; ++y) { + RAWToARGBRow(src_raw, row, width); + ARGBToUV444Row(row, dst_u, dst_v, width); + ARGBToYRow(row, dst_y, width); + src_raw += src_stride_raw; + dst_y += dst_stride_y; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + free_aligned_buffer_64(row); + } + return 0; +} + // RAW big endian (rgb in memory) to J444 // 2 step conversion of RAWToARGB then ARGBToYJ and ARGBToUVJ444 LIBYUV_API @@ -3714,7 +3926,7 @@ int RAWToJ444(const uint8_t* src_raw, { // Allocate a row of ARGB. - const int row_size = (width * 4 + 31) & ~31; + const int row_size = width * 4; align_buffer_64(row, row_size); if (!row) return 1; diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 498f66ffc..be36343b0 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -678,6 +678,7 @@ TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2) TESTATOPLANAR(I400, 1, 1, I420, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2) +TESTATOPLANAR(RAW, 3, 1, I444, 1, 1) TESTATOPLANAR(RAW, 3, 1, J420, 2, 2) TESTATOPLANAR(RAW, 3, 1, J444, 1, 1) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2)