diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 30a6d3bcd..9caef1b5b 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -105,6 +105,19 @@ void MergeUVPlane(const uint8_t* src_u,
                   int width,
                   int height);
 
+// Scale U and V to half width and height and merge into interleaved UV plane.
+// width and height are source size, allowing odd sizes.
+// Use for converting I444 or I422 to NV12.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height);
+
 // Swap U and V channels in interleaved UV plane.
 LIBYUV_API
 void SwapUVPlane(const uint8_t* src_uv,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 45db1b91d..02abff6f4 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -273,6 +273,7 @@ extern "C" {
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
 // I210 is for H010. 2 = 422. I for 601 vs H for 709.
 #define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
@@ -343,7 +344,6 @@ extern "C" {
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
-#define HAS_RGBATOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
 #define HAS_AYUVTOUVROW_NEON
 #define HAS_AYUVTOVUROW_NEON
@@ -353,6 +353,7 @@ extern "C" {
 #define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
 #define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
 #define HAS_I400TOARGBROW_NEON
 #define HAS_I422ALPHATOARGBROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
@@ -375,19 +376,20 @@ extern "C" {
 #define HAS_NV21TORGB24ROW_NEON
 #define HAS_NV21TOYUV24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
-#define HAS_RAWTORGBAROW_NEON
 #define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
 #define HAS_RAWTOUVROW_NEON
-#define HAS_RAWTOYROW_NEON
 #define HAS_RAWTOYJROW_NEON
+#define HAS_RAWTOYROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
 #define HAS_RGB24TOUVROW_NEON
-#define HAS_RGB24TOYROW_NEON
 #define HAS_RGB24TOYJROW_NEON
+#define HAS_RGB24TOYROW_NEON
 #define HAS_RGB565TOARGBROW_NEON
 #define HAS_RGB565TOUVROW_NEON
 #define HAS_RGB565TOYROW_NEON
 #define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
 #define HAS_SPLITRGBROW_NEON
@@ -1712,6 +1714,27 @@ void MergeUVRow_Any_MMI(const uint8_t* y_buf,
                         uint8_t* dst_ptr,
                         int width);
 
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width);
+
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
+
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width);
+
 void SplitRGBRow_C(const uint8_t* src_rgb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
diff --git a/source/convert.cc b/source/convert.cc
index 0645debcf..ce1152df5 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -426,7 +426,41 @@ int I444ToI420(const uint8_t* src_y,
                     dst_v, dst_stride_v, width, height, width, height);
 }
 
-// TODO(fbarchard): Implement row conversion.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+                   dst_stride_uv, width, height);
+  return 0;
+}
+
 LIBYUV_API
 int I444ToNV21(const uint8_t* src_y,
                int src_stride_y,
@@ -440,30 +474,9 @@ int I444ToNV21(const uint8_t* src_y,
                int dst_stride_vu,
                int width,
                int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  // Allocate u and v buffers
-  align_buffer_64(plane_u, halfwidth * halfheight * 2);
-  uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
-  I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
-             height);
-  MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
-               halfwidth, halfheight);
-  free_aligned_buffer_64(plane_u);
-  return 0;
+  return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+                    width, height);
 }
 
 // I400 is greyscale typically used in MJPG
 LIBYUV_API
 int I400ToI420(const uint8_t* src_y,
@@ -498,46 +511,6 @@ int I400ToI420(const uint8_t* src_y,
   return 0;
 }
 
-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I444ToNV12(const uint8_t* src_y,
-               int src_stride_y,
-               const uint8_t* src_u,
-               int src_stride_u,
-               const uint8_t* src_v,
-               int src_stride_v,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_uv,
-               int dst_stride_uv,
-               int width,
-               int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  // Allocate u and v buffers
-  align_buffer_64(plane_u, halfwidth * halfheight * 2);
-  uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
-  I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
-             height);
-  MergeUVPlane(plane_u, halfwidth, plane_v, halfwidth, dst_uv, dst_stride_uv,
-               halfwidth, halfheight);
-  free_aligned_buffer_64(plane_u);
-  return 0;
-}
-
 // I400 is greyscale typically used in MJPG
 LIBYUV_API
 int I400ToNV21(const uint8_t* src_y,
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 0c95f1f29..25404afd4 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -488,7 +488,6 @@ int I420ToUYVY(const uint8_t* src_y,
   return 0;
 }
 
-// TODO(fbarchard): test negative height for invert.
 LIBYUV_API
 int I420ToNV12(const uint8_t* src_y,
@@ -502,12 +501,23 @@ int I420ToNV12(const uint8_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
+  int halfwidth = (width + 1) / 2;
+  int halfheight = (height + 1) / 2;
   if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
       height == 0) {
     return -1;
   }
-  int halfwidth = (width + 1) / 2;
-  int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index eea4fdc56..20347cfab 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -4103,6 +4103,52 @@ int UYVYToNV12(const uint8_t* src_uyvy,
   return 0;
 }
 
+// width and height are src size allowing odd size handling.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height) {
+  int y;
+  void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+                         const uint8_t* src_v, int src_stride_v,
+                         uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_NEON;
+  }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    // Merge a row of U and V into a row of UV.
+    HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+    src_u += src_stride_u * 2;
+    src_v += src_stride_v * 2;
+    dst_uv += dst_stride_uv;
+  }
+  if (height & 1) {
+    HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_common.cc b/source/row_common.cc
index 026845b9e..67b0c7233 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -3563,6 +3563,30 @@ void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
   }
 }
 
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] +
+                 src_u[src_stride_u + 1] + 2) >>
+                2;
+    dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] +
+                 src_v[src_stride_v + 1] + 2) >>
+                2;
+    src_u += 2;
+    src_v += 2;
+    dst_uv += 2;
+  }
+  if (width & 1) {
+    dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1;
+    dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 9e7b9e5ed..e2088561e 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1078,6 +1078,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif
 
+// clang-format off
+
 // TODO(mraptis): Consider passing R, G, B multipliers as parameter.
 // round parameter is register containing value to add before shift.
 #define RGBTOY(round) \
@@ -1102,10 +1104,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   "phaddw %%xmm0,%%xmm6 \n" \
   "phaddw %%xmm2,%%xmm1 \n" \
   "prefetcht0 1280(%0) \n" \
-  "paddw %%" #round \
-  ",%%xmm6 \n" \
-  "paddw %%" #round \
-  ",%%xmm1 \n" \
+  "paddw %%" #round ",%%xmm6 \n" \
+  "paddw %%" #round ",%%xmm1 \n" \
   "psrlw $0x8,%%xmm6 \n" \
   "psrlw $0x8,%%xmm1 \n" \
   "packuswb %%xmm1,%%xmm6 \n" \
@@ -1132,10 +1132,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
   "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
   "prefetcht0 1280(%0) \n" \
-  "vpaddw %%" #round \
-  ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
-  "vpaddw %%" #round \
-  ",%%ymm2,%%ymm2 \n" \
+  "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
+  "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
   "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
   "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
   "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
@@ -1146,6 +1144,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   "jg 1b \n" \
   "vzeroupper \n"
 
+// clang-format on
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
 void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -7005,6 +7005,53 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
 }
 #endif  // HAS_SWAPUVROW_AVX2
 
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width) {
+  asm volatile(
+      "pcmpeqb %%xmm4,%%xmm4 \n"
+      "psrlw $0xf,%%xmm4 \n"
+      "packuswb %%xmm4,%%xmm4 \n"
+      "pxor %%xmm5,%%xmm5 \n"
+      "1: \n"
+
+      LABELALIGN
+      "1: \n"
+      "movdqu (%0),%%xmm0 \n"  // load 16 U values
+      "movdqu (%1),%%xmm1 \n"  // load 16 V values
+      "movdqu 0(%0,%4,1),%%xmm2 \n"  // 16 from next row
+      "movdqu 0(%1,%5,1),%%xmm3 \n"
+      "lea 0x10(%0),%0 \n"
+      "pmaddubsw %%xmm4,%%xmm0 \n"  // half size
+      "pmaddubsw %%xmm4,%%xmm1 \n"
+      "pmaddubsw %%xmm4,%%xmm2 \n"
+      "pmaddubsw %%xmm4,%%xmm3 \n"
+      "lea 0x10(%1),%1 \n"
+      "paddw %%xmm2,%%xmm0 \n"
+      "paddw %%xmm3,%%xmm1 \n"
+      "psrlw $0x1,%%xmm0 \n"
+      "psrlw $0x1,%%xmm1 \n"
+      "pavgw %%xmm5,%%xmm0 \n"
+      "pavgw %%xmm5,%%xmm1 \n"
+      "packuswb %%xmm0,%%xmm0 \n"
+      "packuswb %%xmm1,%%xmm1 \n"
+      "punpcklbw %%xmm1,%%xmm0 \n"
+      "movdqu %%xmm0,(%2) \n"  // store 8 UV pixels
+      "lea 0x10(%2),%2 \n"
+      "sub $0x10,%3 \n"  // 16 src pixels per loop
+      "jg 1b \n"
+      : "+r"(src_u),  // %0
+        "+r"(src_v),  // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)  // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
+        "r"((intptr_t)(src_stride_v))  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/row_neon.cc b/source/row_neon.cc
index f358bd361..aecdf329c 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2984,6 +2984,39 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
       : "cc", "memory", "q0", "q1", "q2");
 }
 
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  const uint8_t* src_u_1 = src_u + src_stride_u;
+  const uint8_t* src_v_1 = src_v + src_stride_v;
+  asm volatile(
+      "1: \n"
+      "vld1.8 {q0}, [%0]! \n"  // load 16 U values
+      "vld1.8 {q1}, [%2]! \n"  // load 16 V values
+      "vld1.8 {q2}, [%1]! \n"
+      "vld1.8 {q3}, [%3]! \n"
+      "vpaddl.u8 q0, q0 \n"  // half size
+      "vpaddl.u8 q1, q1 \n"
+      "vpadal.u8 q0, q2 \n"
+      "vpadal.u8 q1, q3 \n"
+      "vqrshrn.u16 d0, q0, #2 \n"
+      "vqrshrn.u16 d1, q1, #2 \n"
+      "subs %5, %5, #16 \n"  // 16 src pixels per loop
+      "vst2.8 {d0, d1}, [%4]! \n"  // store 8 UV pixels
+      "bgt 1b \n"
+      : "+r"(src_u),  // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),  // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),  // %4
+        "+r"(width)  // %5
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 
 #ifdef __cplusplus
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index b57147bbd..f9e0fd369 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -3188,11 +3188,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
       "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
       "uadalp v0.8h, v4.16b \n"  // V 16 bytes -> 8 shorts.
       "uadalp v1.8h, v5.16b \n"  // U 16 bytes -> 8 shorts.
+      "prfm pldl1keep, [%0, 448] \n"
       "uqrshrn v3.8b, v0.8h, #2 \n"  // 2x2 average
       "uqrshrn v2.8b, v1.8h, #2 \n"
+      "prfm pldl1keep, [%1, 448] \n"
       "subs %w3, %w3, #16 \n"  // 16 processed per loop.
       "st2 {v2.8b,v3.8b}, [%2], #16 \n"  // store 8 pixels UV.
-      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
       "b.gt 1b \n"
       : "+r"(src_ayuv),  // %0
         "+r"(src_ayuv_1),  // %1
@@ -3210,18 +3211,18 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
 
   asm volatile(
       "1: \n"
-      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
-                                                         // pixels.
+      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
       "uaddlp v0.8h, v0.16b \n"  // V 16 bytes -> 8 shorts.
       "uaddlp v1.8h, v1.16b \n"  // U 16 bytes -> 8 shorts.
       "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
       "uadalp v0.8h, v4.16b \n"  // V 16 bytes -> 8 shorts.
       "uadalp v1.8h, v5.16b \n"  // U 16 bytes -> 8 shorts.
+      "prfm pldl1keep, [%0, 448] \n"
       "uqrshrn v0.8b, v0.8h, #2 \n"  // 2x2 average
       "uqrshrn v1.8b, v1.8h, #2 \n"
+      "prfm pldl1keep, [%1, 448] \n"
       "subs %w3, %w3, #16 \n"  // 16 processed per loop.
       "st2 {v0.8b,v1.8b}, [%2], #16 \n"  // store 8 pixels VU.
-      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
       "b.gt 1b \n"
       : "+r"(src_ayuv),  // %0
         "+r"(src_ayuv_1),  // %1
@@ -3265,6 +3266,41 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
       : "cc", "memory", "v0", "v1", "v2");
 }
 
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  const uint8_t* src_u_1 = src_u + src_stride_u;
+  const uint8_t* src_v_1 = src_v + src_stride_v;
+  asm volatile(
+      "1: \n"
+      "ld1 {v0.16b}, [%0], #16 \n"  // load 16 U values
+      "ld1 {v1.16b}, [%2], #16 \n"  // load 16 V values
+      "ld1 {v2.16b}, [%1], #16 \n"
+      "ld1 {v3.16b}, [%3], #16 \n"
+      "uaddlp v0.8h, v0.16b \n"  // half size
+      "uaddlp v1.8h, v1.16b \n"
+      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
+      "uadalp v0.8h, v2.16b \n"
+      "uadalp v1.8h, v3.16b \n"
+      "prfm pldl1keep, [%2, 448] \n"
+      "uqrshrn v0.8b, v0.8h, #2 \n"
+      "uqrshrn v1.8b, v1.8h, #2 \n"
+      "subs %w5, %w5, #16 \n"  // 16 src pixels per loop
+      "st2 {v0.8b, v1.8b}, [%4], #16 \n"  // store 8 UV pixels
+      "b.gt 1b \n"
+      : "+r"(src_u),  // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),  // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),  // %4
+        "+r"(width)  // %5
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index f97ad9a79..31a425356 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -21,6 +21,7 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"
 
 #ifdef ENABLE_ROW_TESTS
 // row.h defines SIMD_ALIGNED, overriding unit_test.h
@@ -3479,4 +3480,50 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
   free_aligned_buffer_page_end(orig_pixels);
 }
 
+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
+  // Round count up to multiple of 16
+  int dst_width = (benchmark_width_ + 1) / 2;
+  int dst_height = (benchmark_height_ + 1) / 2;
+  align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(tmp_pixels_u, dst_width * dst_height);
+  align_buffer_page_end(tmp_pixels_v, dst_width * dst_height);
+  align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+  align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+  MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_);
+  MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_);
+  MemRandomize(tmp_pixels_u, dst_width * dst_height);
+  MemRandomize(tmp_pixels_v, dst_width * dst_height);
+  MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
+  MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
+
+  ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_,
+             benchmark_height_,
+
+             tmp_pixels_u, dst_width, dst_width, dst_height, kFilterBilinear);
+  ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_,
+             benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height,
+             kFilterBilinear);
+  MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width,
+               dst_pixels_uv_c, dst_width * 2, dst_width, dst_height);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+                     benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+                     benchmark_width_, benchmark_height_);
+  }
+
+  for (int i = 0; i < dst_width * 2 * dst_height; ++i) {
+    EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_u);
+  free_aligned_buffer_page_end(src_pixels_v);
+  free_aligned_buffer_page_end(tmp_pixels_u);
+  free_aligned_buffer_page_end(tmp_pixels_v);
+  free_aligned_buffer_page_end(dst_pixels_uv_opt);
+  free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
 }  // namespace libyuv
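For reference, a minimal caller of the new HalfMergeUVPlane() is sketched below. It mirrors what the new I444ToNV12() does internally: CopyPlane() for luma, then a single 2x2 box-filter-and-interleave pass for chroma. This is illustrative only and not part of the patch; the function name and the assumption that every plane's stride equals its width are mine.

```cpp
// Minimal sketch (assumes stride == width for all planes).
#include "libyuv/planar_functions.h"

void ConvertI444ToNV12Frame(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_y,
                            uint8_t* dst_uv,
                            int width,
                            int height) {
  // NV12 chroma is (width+1)/2 x (height+1)/2 UV pairs, so the UV plane's
  // byte stride is 2 * ((width + 1) / 2).
  const int dst_stride_uv = ((width + 1) / 2) * 2;
  // Y is copied unchanged.
  libyuv::CopyPlane(src_y, width, dst_y, width, width, height);
  // U and V are 2x2 averaged and interleaved in one pass. The width/height
  // passed here are the source dimensions; odd sizes are handled internally.
  libyuv::HalfMergeUVPlane(src_u, width, src_v, width, dst_uv, dst_stride_uv,
                           width, height);
}
```

Compared to the previous I444ToNV12/I444ToNV21 path, this avoids the temporary half-size U and V planes entirely, which is the point of the new row functions.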