I010ToNV12 conversion using 2 step row function for UV

- convert full Y plane with row coalescing if possible
- convert rows of UV from 10 bit to 8 bit then call MergeUV

libyuv_test '--gunit_filter=*010ToNV12_Opt' --libyuv_width=3840 --libyuv_height=2160 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
Note: Google Test filter = *010ToNV12_Opt

Skylake Xeon Was 2 pass planes
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (4512 ms)
Now 2 pass rows
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (2400 ms)
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (2265 ms)

On Samsung S23
libyuv_test --gunit_filter=*.????ToNV12_Opt --libyuv_width=3840 --libyuv_height=2160 --libyuv_repeat=1000'

Was
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (3563 ms)

Now
[       OK ] LibYUVConvertTest.AYUVToNV12_Opt (3068 ms
[       OK ] LibYUVConvertTest.ARGBToNV12_Opt (2990 ms
[       OK ] LibYUVConvertTest.ABGRToNV12_Opt (2904 ms
[       OK ] LibYUVConvertTest.P010ToNV12_Opt (1177 ms
[       OK ] LibYUVConvertTest.I010ToNV12_Opt (1150 ms <- now
[       OK ] LibYUVConvertTest.I444ToNV12_Opt (1118 ms
[       OK ] LibYUVConvertTest.MM21ToNV12_Opt (1008 ms
[       OK ] LibYUVConvertTest.UYVYToNV12_Opt (1007 ms
[       OK ] LibYUVConvertTest.YUY2ToNV12_Opt (938 ms)
[       OK ] LibYUVConvertTest.NV21ToNV12_Opt (496 ms)
[       OK ] LibYUVConvertTest.I420ToNV12_Opt (466 ms)


Bug: b/357439226, b/357721018
Change-Id: I48405929ae835b171e7d556a16794eac22c50ae9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5782404
Reviewed-by: Wan-Teh Chang <wtc@google.com>
This commit is contained in:
Frank Barchard 2024-08-12 18:23:57 -07:00
parent 5dfa75670d
commit 336e6fd25b

View File

@ -11,7 +11,6 @@
#include "libyuv/convert.h"
#include "libyuv/basic_types.h"
#include "libyuv/convert_from.h" // For I420ToNV12()
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
@ -659,10 +658,22 @@ int I010ToNV12(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
int r;
int y;
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
const int scale = 16385; // 16384 for 10 bits
void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
int width) = Convert16To8Row_C;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
int halfheight = (height + 1) >> 1;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
@ -670,34 +681,109 @@ int I010ToNV12(const uint16_t* src_y,
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
// Allocate temporary buffers for 3 planes in 8 bit.
{
align_buffer_64(temp_y, width * height);
align_buffer_64(temp_u, ((width + 1) / 2) * ((height + 1) / 2));
align_buffer_64(temp_v, ((width + 1) / 2) * ((height + 1) / 2));
int temp_stride_y = width;
int temp_stride_u = (width + 1) / 2;
int temp_stride_v = (width + 1) / 2;
// The first step is to convert 10 bit YUV I010 to 8 bit I420 with 3 planes.
r = I010ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
temp_y, temp_stride_y, temp_u, temp_stride_u, temp_v,
temp_stride_v, width, height);
if (!r) {
// The second step is to convert 8 bit I420 with 3 planes to 8 bit NV12 with 2 planes.
r = I420ToNV12(temp_y, temp_stride_y, temp_u, temp_stride_u, temp_v,
temp_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height);
#if defined(HAS_CONVERT16TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Convert16To8Row = Convert16To8Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
Convert16To8Row = Convert16To8Row_NEON;
}
// Free temporary buffers.
free_aligned_buffer_64(temp_y);
free_aligned_buffer_64(temp_u);
free_aligned_buffer_64(temp_v);
}
return r;
#endif
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
Convert16To8Row = Convert16To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert16To8Row = Convert16To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert16To8Row = Convert16To8Row_AVX2;
}
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
MergeUVRow = MergeUVRow_Any_AVX512BW;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow = MergeUVRow_AVX512BW;
}
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_NEON;
}
}
#endif
#if defined(HAS_MERGEUVROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MergeUVRow = MergeUVRow_Any_MSA;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_MSA;
}
}
#endif
#if defined(HAS_MERGEUVROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
MergeUVRow = MergeUVRow_Any_LSX;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_LSX;
}
}
#endif
#if defined(HAS_MERGEUVROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
MergeUVRow = MergeUVRow_RVV;
}
#endif
// Convert Y plane.
if (dst_y) {
Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width,
height);
}
{
// Allocate a row of uv.
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
if (!row_u)
return 1;
for (y = 0; y < halfheight; ++y) {
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
MergeUVRow(row_u, row_v, dst_uv, halfwidth);
src_u += src_stride_u;
src_v += src_stride_v;
dst_uv += dst_stride_uv;
}
free_aligned_buffer_64(row_u);
}
return 0;
}
LIBYUV_API