From 336e6fd25ba8eab72176925e5b6c71379f920207 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 12 Aug 2024 18:23:57 -0700 Subject: [PATCH] I010ToNV12 conversion using 2 step row function for UV - convert full Y plane with row coalescing if possible - convert rows of UV from 10 bit to 8 bit then call MergeUV libyuv_test '--gunit_filter=*010ToNV12_Opt' --libyuv_width=3840 --libyuv_height=2160 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 Note: Google Test filter = *010ToNV12_Opt Skylake Xeon Was 2 pass planes [ OK ] LibYUVConvertTest.I010ToNV12_Opt (4512 ms) Now 2 pass rows [ OK ] LibYUVConvertTest.I010ToNV12_Opt (2400 ms) [ OK ] LibYUVConvertTest.P010ToNV12_Opt (2265 ms) On Samsung S23 libyuv_test '--gunit_filter=*.????ToNV12_Opt' --libyuv_width=3840 --libyuv_height=2160 --libyuv_repeat=1000 Was [ OK ] LibYUVConvertTest.I010ToNV12_Opt (3563 ms) Now [ OK ] LibYUVConvertTest.AYUVToNV12_Opt (3068 ms) [ OK ] LibYUVConvertTest.ARGBToNV12_Opt (2990 ms) [ OK ] LibYUVConvertTest.ABGRToNV12_Opt (2904 ms) [ OK ] LibYUVConvertTest.P010ToNV12_Opt (1177 ms) [ OK ] LibYUVConvertTest.I010ToNV12_Opt (1150 ms) <- now [ OK ] LibYUVConvertTest.I444ToNV12_Opt (1118 ms) [ OK ] LibYUVConvertTest.MM21ToNV12_Opt (1008 ms) [ OK ] LibYUVConvertTest.UYVYToNV12_Opt (1007 ms) [ OK ] LibYUVConvertTest.YUY2ToNV12_Opt (938 ms) [ OK ] LibYUVConvertTest.NV21ToNV12_Opt (496 ms) [ OK ] LibYUVConvertTest.I420ToNV12_Opt (466 ms) Bug: b/357439226, b/357721018 Change-Id: I48405929ae835b171e7d556a16794eac22c50ae9 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5782404 Reviewed-by: Wan-Teh Chang --- source/convert.cc | 144 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 115 insertions(+), 29 deletions(-) diff --git a/source/convert.cc b/source/convert.cc index 7c9f364bc..4ff63f6f9 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -11,7 +11,6 @@ #include "libyuv/convert.h" #include "libyuv/basic_types.h" -#include "libyuv/convert_from.h" // For 
I420ToNV12() #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" @@ -659,10 +658,22 @@ int I010ToNV12(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - int r; + int y; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + const int scale = 16385; // 16384 for 10 bits + void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, + int width) = Convert16To8Row_C; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. if (height < 0) { height = -height; - int halfheight = (height + 1) >> 1; + halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; @@ -670,34 +681,109 @@ int I010ToNV12(const uint16_t* src_y, src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } - - // Allocate temporary buffers for 3 planes in 8 bit. - { - align_buffer_64(temp_y, width * height); - align_buffer_64(temp_u, ((width + 1) / 2) * ((height + 1) / 2)); - align_buffer_64(temp_v, ((width + 1) / 2) * ((height + 1) / 2)); - - int temp_stride_y = width; - int temp_stride_u = (width + 1) / 2; - int temp_stride_v = (width + 1) / 2; - - // The first step is to convert 10 bit YUV I010 to 8 bit I420 with 3 planes. - r = I010ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, - temp_y, temp_stride_y, temp_u, temp_stride_u, temp_v, - temp_stride_v, width, height); - if (!r) { - // The second step is to convert 8 bit I420 with 3 planes to 8 bit NV12 with 2 planes. 
- r = I420ToNV12(temp_y, temp_stride_y, temp_u, temp_stride_u, temp_v, - temp_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, - width, height); +#if defined(HAS_CONVERT16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Convert16To8Row = Convert16To8Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_NEON; } - - // Free temporary buffers. - free_aligned_buffer_64(temp_y); - free_aligned_buffer_64(temp_u); - free_aligned_buffer_64(temp_v); } - return r; +#endif +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MergeUVRow = MergeUVRow_Any_AVX512BW; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow = MergeUVRow_AVX512BW; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_LSX) + if (TestCpuFlag(kCpuHasLSX)) { + MergeUVRow = MergeUVRow_Any_LSX; + if (IS_ALIGNED(halfwidth, 
16)) { + MergeUVRow = MergeUVRow_LSX; + } + } +#endif +#if defined(HAS_MERGEUVROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeUVRow = MergeUVRow_RVV; + } +#endif + + // Convert Y plane. + if (dst_y) { + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, + height); + } + + { + // Allocate a row of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + if (!row_u) + return 1; + + for (y = 0; y < halfheight; ++y) { + Convert16To8Row(src_u, row_u, scale, halfwidth); + Convert16To8Row(src_v, row_v, scale, halfwidth); + MergeUVRow(row_u, row_v, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += dst_stride_uv; + } + free_aligned_buffer_64(row_u); + } + return 0; } LIBYUV_API