From 413a8d8041f1cc5a350a47c0d81cc721e64f9fd0 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 12 Apr 2019 10:20:44 -0700 Subject: [PATCH] Add AYUVToNV12 and NV21ToNV12 BUG=libyuv:832 TESTED=out/Release/libyuv_unittest --gtest_filter=*ToNV12* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 R=rrwinterton@gmail.com Change-Id: Id03b4613211fb6a6e163d10daa7c692fe31e36d8 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1560080 Commit-Queue: Frank Barchard Reviewed-by: richard winterton Reviewed-by: Frank Barchard --- README.chromium | 2 +- include/libyuv/convert.h | 11 ++ include/libyuv/planar_functions.h | 13 ++ include/libyuv/row.h | 28 ++++- include/libyuv/version.h | 2 +- source/convert.cc | 73 ++++++++++- source/convert_argb.cc | 6 +- source/planar_functions.cc | 58 ++++++++- source/row_any.cc | 5 +- source/row_common.cc | 64 ++++++++-- source/row_gcc.cc | 76 +++++------ source/row_neon.cc | 202 +++++++++++++++++++----------- source/row_neon64.cc | 199 +++++++++++++++++------------ source/row_win.cc | 8 +- source/scale_gcc.cc | 16 +-- source/scale_neon.cc | 12 +- source/scale_neon64.cc | 16 +-- unit_test/convert_test.cc | 153 +++++++++++++++++----- unit_test/planar_test.cc | 45 +++++-- util/psnr.cc | 2 +- 20 files changed, 710 insertions(+), 281 deletions(-) diff --git a/README.chromium b/README.chromium index 75bd2cfa1..f00b242a4 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1725 +Version: 1727 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 892a3c91b..fb8a38533 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -226,6 +226,17 @@ int UYVYToI420(const uint8_t* src_uyvy, int width, int height); +// Convert AYUV to NV12. +LIBYUV_API +int AYUVToNV12(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert AYUV to NV21. LIBYUV_API int AYUVToNV21(const uint8_t* src_ayuv, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 91137baba..f6f5b3edd 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -224,6 +224,19 @@ int UYVYToNV12(const uint8_t* src_uyvy, int width, int height); +// Convert NV21 to NV12. 
+LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + LIBYUV_API int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8cfec20ef..9bb488506 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -332,6 +332,7 @@ extern "C" { #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON +#define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON #define HAS_AYUVTOYROW_NEON #define HAS_BGRATOUVROW_NEON @@ -375,6 +376,7 @@ extern "C" { #define HAS_SETROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON +#define HAS_UVToVUROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON @@ -3370,17 +3372,34 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); - +void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void UVToVURow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToVURow_C(const uint8_t* src_ayuv, int stride_ayuv, +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_C(const uint8_t* src_ayuv, + int stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToVURow_NEON(const uint8_t* src_ayuv, int stride_ayuv, +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, int stride_ayuv, +void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, + int stride_ayuv, uint8_t* dst_vu, int width); @@ -4010,7 +4029,6 @@ void FloatDivToByteRow_NEON(const float* src_weights, uint8_t* dst_mask, int width); - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e6bf67e16..2b8159095 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1725 +#define LIBYUV_VERSION 1727 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index b4550685e..094d6eeed 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -880,6 +880,75 @@ int UYVYToI420(const uint8_t* src_uyvy, return 0; } +// Convert AYUV to NV12. +LIBYUV_API +int AYUVToNV12(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv, + uint8_t* dst_uv, int width) = AYUVToUVRow_C; + void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = + AYUVToYRow_C; + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToUVRow = AYUVToUVRow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToUVRow = AYUVToUVRow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToUVRow = AYUVToUVRow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToUVRow = AYUVToUVRow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToUVRow = AYUVToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToUVRow = AYUVToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + AYUVToUVRow(src_ayuv, 0, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + // Convert AYUV to NV21. LIBYUV_API int AYUVToNV21(const uint8_t* src_ayuv, @@ -892,8 +961,7 @@ int AYUVToNV21(const uint8_t* src_ayuv, int height) { int y; void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv, - uint8_t* dst_vu, int width) = - AYUVToVURow_C; + uint8_t* dst_vu, int width) = AYUVToVURow_C; void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = AYUVToYRow_C; // Negative height means invert the image. @@ -2235,7 +2303,6 @@ int Android420ToI420(const uint8_t* src_y, return 0; } - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/convert_argb.cc b/source/convert_argb.cc index d9660b115..ffca4ea0d 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -2008,10 +2008,8 @@ int NV21ToYUV24(const uint8_t* src_y, int width, int height) { int y; - void (*NV21ToYUV24Row)(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) = NV21ToYUV24Row_C; + void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, + uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { return -1; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b49bf0a0b..9cab230f3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -440,7 +440,6 @@ void MergeUVPlane(const uint8_t* src_u, int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; - // Coalesce rows. // Negative height means invert the image. if (height < 0) { height = -height; @@ -504,6 +503,63 @@ void MergeUVPlane(const uint8_t* src_u, } } +// Convert NV21 to NV12. 
+LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = + UVToVURow_C; + + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_vu || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_vu = src_vu + (halfheight - 1) * src_stride_vu; + src_stride_y = -src_stride_y; + src_stride_vu = -src_stride_vu; + } + // Coalesce rows. + if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) { + halfwidth *= halfheight; + halfheight = 1; + src_stride_vu = dst_stride_uv = 0; + } + +#if defined(HAS_UVToVUROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + UVToVURow = UVToVURow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + UVToVURow = UVToVURow_NEON; + } + } +#endif + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + + for (y = 0; y < halfheight; ++y) { + UVToVURow(src_vu, dst_uv, halfwidth); + src_vu += src_stride_vu; + dst_uv += dst_stride_uv; + } + return 0; +} + // Support function for NV12 etc RGB channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API diff --git a/source/row_any.cc b/source/row_any.cc index 37bd9970f..06ca723a2 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -707,10 +707,12 @@ ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #ifdef HAS_UYVYTOYROW_MMI ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) #endif - #ifdef HAS_AYUVTOYROW_NEON ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_AYUVTOYROW_NEON +ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15) +#endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif @@ -1416,6 +1418,7 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) } #ifdef HAS_AYUVTOVUROW_NEON +ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15) ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) #endif #undef ANY11S diff --git a/source/row_common.cc b/source/row_common.cc index 257daa6c0..8951d0037 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -3236,14 +3236,13 @@ void NV21ToYUV24Row_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - int x; for (x = 0; x < width - 1; x += 2) { dst_yuv24[0] = src_vu[0]; // V - dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[1] = src_vu[1]; // U dst_yuv24[2] = src_y[0]; // Y0 dst_yuv24[3] = src_vu[0]; // V - dst_yuv24[4] = src_vu[1]; // U + dst_yuv24[4] = src_vu[1]; // U dst_yuv24[5] = src_y[1]; // Y1 src_y += 2; src_vu += 2; @@ -3256,6 +3255,33 @@ void NV21ToYUV24Row_C(const uint8_t* src_y, } } +// Filter 2 rows of AYUV UV's (444) into UV (420). +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2x2 rows of AYUV. 
+ int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + src_ayuv += 8; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } +} + // Filter 2 rows of AYUV UV's (444) into VU (420). void AYUVToVURow_C(const uint8_t* src_ayuv, int src_stride_ayuv, @@ -3264,15 +3290,23 @@ void AYUVToVURow_C(const uint8_t* src_ayuv, // Output a row of VU values, filtering 2x2 rows of AYUV. int x; for (x = 0; x < width; x += 2) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2; + dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; src_ayuv += 8; dst_vu += 2; } if (width & 1) { - dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 0] + 2) >> 2; - dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 1] + 2) >> 2; - } + dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } } // Copy row of AYUV Y's into Y @@ -3280,11 +3314,23 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width; ++x) { - dst_y[x] = src_ayuv[2]; // v,u,y,a + dst_y[x] = src_ayuv[2]; // v,u,y,a src_ayuv += 4; } } +void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t u = src_uv[0]; + uint8_t v = src_uv[1]; + dst_vu[0] = v; + dst_vu[1] = u; + src_uv += 2; + dst_vu += 2; + } +} + // divide values by weights and provide mask to indicate weight of 0. 
void FloatDivToByteRow_C(const float* src_weights, const float* src_values, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 99d73053f..decd3d2e4 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -5238,7 +5238,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); + ); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -6669,7 +6669,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 - #ifdef HAS_NV21TOYUV24ROW_AVX2 // begin NV21ToYUV24Row_C avx2 constants @@ -6723,48 +6722,54 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - uint8_t* src_y_ptr; uint64_t src_offset = 0; uint64_t width64; width64 = width; - src_y_ptr = (uint8_t *) src_y; + src_y_ptr = (uint8_t*)src_y; asm volatile( - "vmovdqu %5, %%ymm0 \n" //init blend value - "vmovdqu %6, %%ymm1 \n" //init blend value - "vmovdqu %7, %%ymm2 \n" //init blend value -// "sub $0x20, %3 \n" //sub 32 from width for final loop + "vmovdqu %5, %%ymm0 \n" // init blend value + "vmovdqu %6, %%ymm1 \n" // init blend value + "vmovdqu %7, %%ymm2 \n" // init blend value + // "sub $0x20, %3 \n" //sub 32 from width for final loop LABELALIGN - "1: \n" //label 1 - "vmovdqu (%0,%4), %%ymm3 \n" //src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" //src_uv+1 - "vmovdqu (%1), %%ymm5 \n" //src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" //y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" //uv+1, kSHUF1 for shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" //uv, kSHUF2 for shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" //y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" //uv+1 kSHUF4 for shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" //blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" //blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" //blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" //blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" //shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" //get results - "vmovdqu %%ymm12, 0x20(%2) \n" //store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" //get results - "add $0x20, %4 \n" //add to src buffer ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" //insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" //insert - "vmovdqu %%ymm4, (%2) \n" //store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" //store dst_yuv+40h - "add $0x60,%2 \n" //add to dst buffer ptr -// "cmp %3, %4 \n" //(width64 - 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop + "1: \n" // label 1 + "vmovdqu (%0,%4), %%ymm3 \n" // src_y + "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 + "vmovdqu (%1), %%ymm5 \n" // src_uv + "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf + "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for + // shuf + "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for + // shuf + "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf + "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for + // shuf + "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 + "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 + "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 + "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 + "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const + "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results + "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h + "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results + "add $0x20, %4 \n" // add to src buffer + // ptr + "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert + 
"vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert + "vmovdqu %%ymm4, (%2) \n" // store dst_yuv + "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h + "add $0x60,%2 \n" // add to dst buffer + // ptr + // "cmp %3, %4 \n" //(width64 - + // 32 bytes) and src_offset + "sub $0x20,%3 \n" // 32 pixels per loop "jg 1b \n" - "vzeroupper \n" //sse-avx2 transistions + "vzeroupper \n" // sse-avx2 + // transistions : "+r"(src_y), //%0 "+r"(src_vu), //%1 @@ -6780,7 +6785,8 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, "m"(kSHUF3), //%11 "m"(kSHUF4), //%12 "m"(kSHUF5) //%13 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", "xmm13", "xmm14", "xmm15"); + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", + "xmm13", "xmm14", "xmm15"); } #endif // HAS_NV21TOYUV24ROW_AVX2 diff --git a/source/row_neon.cc b/source/row_neon.cc index e440209fc..baf57495c 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -561,7 +561,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. @@ -582,7 +582,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. @@ -607,7 +607,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(width) // %4 : // Input registers : "cc", "memory", "d0", "d1", "d2" // Clobber List - ); + ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time @@ -632,7 +632,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "+r"(width) // %4 : // Input registers : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. @@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "+r"(width) // %2 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // SetRow writes 'width' bytes using an 8 bit value repeated. 
@@ -761,7 +761,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -778,7 +778,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { @@ -795,7 +795,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); + ); } #define RGB565TOARGB \ @@ -826,7 +826,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } #define ARGB1555TOARGB \ @@ -872,7 +872,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } #define ARGB4444TOARGB \ @@ -901,7 +901,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, @@ -919,7 +919,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { @@ -935,7 +935,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -950,7 +950,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { @@ -965,7 +965,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, @@ -985,7 +985,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, @@ -1005,7 +1005,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, @@ -1032,7 +1032,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); + ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, @@ -1059,7 +1059,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
@@ -1081,7 +1081,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, @@ -1241,7 +1241,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -2564,7 +2564,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, : "r"(2), // %5 "r"(6) // %6 : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // SobelY as a matrix is @@ -2601,7 +2601,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, : "r"(1), // %4 "r"(6) // %5 : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // %y passes a float as a scalar vector for vector * scalar multiply. @@ -2690,70 +2690,120 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( - "1: \n" - "vld1.8 {q2}, [%0]! \n" // load 16 Y values - "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values - "vmov d1, d0 \n" - "vzip.u8 d0, d1 \n" // VV - "vmov d3, d2 \n" - "vzip.u8 d2, d3 \n" // UU - "subs %3, %3, #16 \n" // 16 pixels per loop - "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels - "vst3.8 {d1, d3, d5}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2"); + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d0, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); } void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_AYUV - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels. - "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. - "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV pixels. - "vld4.8 {d9, d11, d13, d15}, [%1]! 
\n" // load last 8 AYUV pixels. - "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. - "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average - "vqrshrun.s16 d1, q1, #2 \n" - "subs %3, %3, #16 \n" // 16 processed per loop. - "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_stride_ayuv), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" - ); + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d1, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); } // Copy row of AYUV Y's into Y. // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels - "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.8 {q2}, [%1]! \n" // store 16 Y's. - "bgt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3"); + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Convert biplanar UV channel of NV12 to NV21 +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 5d045f645..449c9f394 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -608,7 +608,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. @@ -629,7 +629,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 
@@ -653,7 +653,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time @@ -677,7 +677,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Copy multiple of 32. @@ -693,7 +693,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "+r"(width) // %2 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // SetRow writes 'width' bytes using an 8 bit value repeated. @@ -800,7 +800,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -818,7 +818,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { @@ -835,7 +835,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } #define RGB565TOARGB \ @@ -867,7 +867,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); + ); } #define ARGB1555TOARGB \ @@ -924,7 +924,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } #define ARGB4444TOARGB \ @@ -955,7 +955,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, @@ -973,7 +973,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { @@ -990,7 +990,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -1005,7 +1005,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { @@ -1020,7 +1020,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, @@ -1040,7 +1040,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, @@ -1060,7 +1060,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, @@ -1087,7 +1087,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, : : "cc", "memory", "v0", "v1", "v2", 
"v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, @@ -1114,7 +1114,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. @@ -1135,7 +1135,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, @@ -1298,7 +1298,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1863,7 +1863,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); + ); } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { @@ -2611,7 +2611,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, : "r"(2LL), // %5 "r"(6LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // SobelY as a matrix is @@ -2648,7 +2648,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, : "r"(1LL), // %4 "r"(6LL) // %5 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // Caveat - rounds float to half float whereas scaling version truncates. @@ -2879,23 +2879,51 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, - uint8_t* dst_yuv24, - int width) { - asm volatile ( - "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values - "subs %w3, %w3, #16 \n" // 16 pixels per loop - "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_yuv24), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2"); + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. 
+ "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } void AYUVToVURow_NEON(const uint8_t* src_ayuv, @@ -2905,40 +2933,41 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. - "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average - "uqrshrn v1.8b, v1.8h, #2 \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(src_ayuv_1), // %1 - "+r"(dst_vu), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels - "subs %w2, %w2, #16 \n" // 16 pixels per loop - "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels - "b.gt 1b \n" - : "+r"(src_ayuv), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3"); + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } void FloatDivToByteRow_NEON(const float* src_weights, @@ -2962,7 +2991,7 @@ void FloatDivToByteRow_NEON(const float* src_weights, "uqxtn v1.4h, v1.4s \n" // 8 shorts "uqxtn2 v1.8h, v2.4s \n" "uqxtn v1.8b, v1.8h \n" // 8 bytes - + "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero @@ -2974,15 +3003,31 @@ void FloatDivToByteRow_NEON(const float* src_weights, "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask "b.gt 1b \n" - : "+r"(src_weights), // %0 - "+r"(src_values), // %1 - "+r"(dst_out), // %2 - "+r"(dst_mask), // %3 - "+r"(width) // %4 + : "+r"(src_weights), // %0 + "+r"(src_values), // %1 + "+r"(dst_out), // %2 + "+r"(dst_mask), // %3 + "+r"(width) // %4 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); } +// Convert biplanar UV channel of NV12 to NV21 +void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + 
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values + "orr v2.16b, v0.16b, v0.16b \n" // move U after V + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 5500d7f5a..4484112c3 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4222,7 +4222,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. convertloop1: movd xmm3, [eax] // src argb lea eax, [eax + 4] @@ -5360,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5448,7 +5448,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. lea eax, [eax + 4] @@ -5534,7 +5534,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: cvttps2dq xmm0, xmm2 // x, y float to int packssdw xmm0, xmm0 // x, y as shorts diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 312236d2d..90a49f30d 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -483,7 +483,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, : "m"(kShuf0), // %0 "m"(kShuf1), // %1 "m"(kShuf2) // %2 - ); + ); asm volatile( LABELALIGN @@ -521,7 +521,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -530,7 +530,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); + ); asm volatile( LABELALIGN @@ -587,7 +587,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -596,7 +596,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); + ); asm volatile( @@ -690,7 +690,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb1), // %1 "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 - ); + ); asm volatile( LABELALIGN @@ -734,7 +734,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 - ); + ); asm volatile( LABELALIGN @@ -1272,7 +1272,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 - ); + ); asm volatile( "movd %5,%%xmm2 \n" diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 46f5ba4cd..366b155ba 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -40,7 +40,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List - ); + ); } // Read 32x1 average down and write 16x1. @@ -61,7 +61,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List - ); + ); } // Read 32x2 average down and write 16x1. 
@@ -92,7 +92,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %3 : : "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, @@ -523,7 +523,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, "+r"(src_width) // %2 : : "memory", "cc", "q0", "q1", "q2" // Clobber List - ); + ); } // TODO(Yang Zhang): Investigate less load instructions for @@ -705,7 +705,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! @@ -734,7 +734,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index f4aed5fc9..0a7b80ce1 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -38,7 +38,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List - ); + ); } // Read 32x1 average down and write 16x1. @@ -60,7 +60,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List - ); + ); } // Read 32x2 average down and write 16x1. @@ -89,7 +89,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, @@ -534,7 +534,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, "+r"(src_width) // %2 : : "memory", "cc", "v0", "v1", "v2" // Clobber List - ); + ); } // TODO(Yang Zhang): Investigate less load instructions for @@ -719,7 +719,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, @@ -742,7 +742,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, @@ -991,7 +991,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // Read 8x2 upsample with filtering and write 16x1. 
@@ -1041,7 +1041,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, "r"(14LL) // %5 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19" // Clobber List - ); + ); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 0fc6f873e..2e1670f2f 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -311,10 +311,10 @@ int I400ToNV21(const uint8_t* src_y, SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ @@ -329,21 +329,21 @@ int I400ToNV21(const uint8_t* src_y, } \ memset(dst_y_c, 1, kWidth* kHeight); \ memset(dst_uv_c, 2, \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_opt, 101, kWidth* kHeight); \ memset(dst_uv_opt, 102, \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \ - dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \ - dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ } \ int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ @@ -357,12 +357,12 @@ int I400ToNV21(const uint8_t* src_y, } \ EXPECT_LE(max_diff, 1); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ int abs_diff = \ abs(static_cast( \ - dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \ + dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ static_cast( \ - dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \ + dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ if (abs_diff > max_diff) { \ max_diff = abs_diff; \ } \ @@ -395,6 +395,99 @@ TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2) TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2) TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) +#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \ + OFF) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_uv, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 0 + OFF] = \ + (fastrand() & 0xff); \ + src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j + 1 + OFF] = \ + (fastrand() & 0xff); \ + } \ + } \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_c, kWidth, dst_uv_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_opt, kWidth, dst_uv_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast(dst_y_c[i * kWidth + j]) - \ + static_cast(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ + int abs_diff = \ + abs(static_cast( \ + dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ + static_cast( \ + dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + } + +#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width, _Unaligned, +, 1) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, 
SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0) + +// TODO(fbarchard): Fix msan on this unittest +// TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2) + #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \ DOY) \ @@ -680,8 +773,8 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) -#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,\ - W1280, DIFF, N, NEG, OFF) \ +#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ + BPP_B, W1280, DIFF, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ @@ -740,15 +833,15 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ - BPP_B, DIFF) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - benchmark_width_, DIFF, _Invert, -, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + DIFF) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, DIFF, _Opt, +, 0) TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2) @@ -980,6 +1073,7 @@ TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) +TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ @@ -1378,14 +1472,15 @@ TEST_F(LibYUVConvertTest, FuzzJpeg) { orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. orig_pixels[kSize - 1] = 0xff; - ValidateJpeg(orig_pixels, kSize); // Failure normally expected. + ValidateJpeg(orig_pixels, + kSize); // Failure normally expected. free_aligned_buffer_page_end(orig_pixels); } } -// Test data created in GIMP. In export jpeg, disable thumbnails etc, -// choose a subsampling, and use low quality (50) to keep size small. -// Generated with xxd -i test.jpg +// Test data created in GIMP. In export jpeg, disable +// thumbnails etc, choose a subsampling, and use low quality +// (50) to keep size small. 
Generated with xxd -i test.jpg // test 0 is J400 static const uint8_t kTest0Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, @@ -1987,8 +2082,8 @@ TEST_F(LibYUVConvertTest, TestMJPGInfo) { EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen)); - EXPECT_EQ(1, - ShowJPegInfo(kTest4Jpg, kTest4JpgLen)); // Valid but unsupported. + EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg, + kTest4JpgLen)); // Valid but unsupported. } #endif // HAVE_JPEG @@ -2906,7 +3001,8 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) { } // Test 10 bit YUV to 10 bit RGB -// Caveat: Result is near due to float rounding in expected result. +// Caveat: Result is near due to float rounding in expected +// result. TEST_F(LibYUVConvertTest, TestH010ToAR30) { const int kSize = 1024; int histogram_b[1024]; @@ -2969,7 +3065,8 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) { } // Test 10 bit YUV to 10 bit RGB -// Caveat: Result is near due to float rounding in expected result. +// Caveat: Result is near due to float rounding in expected +// result. TEST_F(LibYUVConvertTest, TestH010ToAB30) { const int kSize = 1024; int histogram_b[1024]; diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index fcd073a2a..3a5029065 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -3268,10 +3268,10 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { } float TestFloatDivToByte(int benchmark_width, - int benchmark_height, - int benchmark_iterations, - float scale, - bool opt) { + int benchmark_height, + int benchmark_iterations, + float scale, + bool opt) { int i, j; // NEON does multiple of 8, so round count up const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; @@ -3287,7 +3287,8 @@ float TestFloatDivToByte(int benchmark_width, // large values are problematic. audio is really -1 to 1. 
for (i = 0; i < kPixels; ++i) { (reinterpret_cast(src_weights))[i] = scale; - (reinterpret_cast(src_values))[i] = sinf(static_cast(i) * 0.1f); + (reinterpret_cast(src_values))[i] = + sinf(static_cast(i) * 0.1f); } memset(dst_out_c, 0, kPixels); memset(dst_out_opt, 1, kPixels); @@ -3295,24 +3296,24 @@ float TestFloatDivToByte(int benchmark_width, memset(dst_mask_opt, 3, kPixels); FloatDivToByteRow_C(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_c, dst_mask_c, kPixels); + reinterpret_cast(src_values), dst_out_c, + dst_mask_c, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_FLOATDIVTOBYTEROW_NEON FloatDivToByteRow_NEON(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_opt, dst_mask_opt, kPixels); + reinterpret_cast(src_values), dst_out_opt, + dst_mask_opt, kPixels); #else FloatDivToByteRow_C(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_opt, dst_mask_opt, kPixels); + reinterpret_cast(src_values), dst_out_opt, + dst_mask_opt, kPixels); #endif } else { FloatDivToByteRow_C(reinterpret_cast(src_weights), - reinterpret_cast(src_values), - dst_out_opt, dst_mask_opt, kPixels); + reinterpret_cast(src_values), dst_out_opt, + dst_mask_opt, kPixels); } } @@ -3347,5 +3348,23 @@ TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) { EXPECT_EQ(0, diff); } +TEST_F(LibYUVPlanarTest, UVToVURow) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_vu, kPixels * 2); + align_buffer_page_end(dst_pixels_uv, kPixels * 2); + + MemRandomize(src_pixels_vu, kPixels * 2); + memset(dst_pixels_uv, 1, kPixels * 2); + + UVToVURow_C(src_pixels_vu, dst_pixels_uv, kPixels); + + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]); + EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]); + } + + free_aligned_buffer_page_end(src_pixels_vu); + free_aligned_buffer_page_end(dst_pixels_uv); +} } // namespace libyuv diff --git a/util/psnr.cc b/util/psnr.cc index f54015bab..c7bee7f97 100644 --- a/util/psnr.cc +++ b/util/psnr.cc @@ -189,7 +189,7 @@ static uint32_t SumSquareError_SSE2(const uint8_t* src_a, , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); // NOLINT + ); // NOLINT return sse; } #endif // LIBYUV_DISABLE_X86 etc
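
Editor's usage note (illustrative only, not part of the patch): the sketch below shows how the two entry points added here, AYUVToNV12() and NV21ToNV12(), might be called. It assumes a 1280x720 frame with tightly packed strides; the buffer names and sizes are hypothetical and chosen purely for the example.

    #include <cstdint>
    #include <vector>

    #include "libyuv/convert.h"           // AYUVToNV12 (declared in this patch)
    #include "libyuv/planar_functions.h"  // NV21ToNV12 (declared in this patch)

    int main() {
      const int width = 1280;
      const int height = 720;
      const int half_width = (width + 1) / 2;
      const int half_height = (height + 1) / 2;

      // Packed 4:4:4 AYUV source, 4 bytes per pixel; byte order is V, U, Y, A
      // (AYUVToYRow_C reads Y from src_ayuv[2]).
      std::vector<uint8_t> ayuv(static_cast<size_t>(width) * 4 * height);

      // NV12 destination: full-resolution Y plane plus an interleaved UV plane
      // subsampled 2x2.
      std::vector<uint8_t> y(static_cast<size_t>(width) * height);
      std::vector<uint8_t> uv(static_cast<size_t>(half_width) * 2 * half_height);

      // AYUV -> NV12: extracts Y and box-filters 2x2 blocks of U/V down to 4:2:0.
      int ret = libyuv::AYUVToNV12(ayuv.data(), width * 4,     // src AYUV, stride in bytes
                                   y.data(), width,            // dst Y
                                   uv.data(), half_width * 2,  // dst interleaved UV
                                   width, height);
      if (ret != 0) {
        return ret;
      }

      // NV21 -> NV12: copies the Y plane (skipped when dst_y is NULL) and
      // byte-swaps the interleaved VU plane to UV via UVToVURow.
      std::vector<uint8_t> vu(static_cast<size_t>(half_width) * 2 * half_height);
      std::vector<uint8_t> y2(static_cast<size_t>(width) * height);
      std::vector<uint8_t> uv2(static_cast<size_t>(half_width) * 2 * half_height);
      ret = libyuv::NV21ToNV12(y.data(), width,             // src Y
                               vu.data(), half_width * 2,   // src interleaved VU
                               y2.data(), width,            // dst Y
                               uv2.data(), half_width * 2,  // dst interleaved UV
                               width, height);
      return ret;
    }

Design note: both wrappers follow the existing row-function pattern in this patch — a C fallback (AYUVToUVRow_C, UVToVURow_C) with NEON specializations selected at runtime via TestCpuFlag(), plus Any_ wrappers in row_any.cc for widths that are not a multiple of 16; the SSE2/AVX2 branches in AYUVToNV12 are placeholders for future Intel code.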