From 8a13626e42f7fdcf3a6acbb0316760ee54cda7d8 Mon Sep 17 00:00:00 2001
From: Yuan Tong
Date: Sat, 20 Mar 2021 23:22:08 +0800
Subject: [PATCH] Add MergeAR30Plane, MergeAR64Plane, MergeARGB16To8Plane

These functions merge high bit depth planar RGB pixels into packed format.

Change-Id: I506935a164b069e6b2fed8bf152cb874310c0916
Bug: libyuv:886, libyuv:889
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2780468
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 README.chromium                   |   2 +-
 include/libyuv/planar_functions.h |  54 +++++
 include/libyuv/row.h              | 193 +++++++++++++++++
 include/libyuv/version.h          |   2 +-
 source/planar_functions.cc        | 293 ++++++++++++++++++++++++--
 source/row_any.cc                 |  98 ++++++++-
 source/row_common.cc              | 108 ++++++++++
 source/row_gcc.cc                 | 335 +++++++++++++++++++++++++++++-
 source/row_neon.cc                | 264 +++++++++++++++++++++--
 source/row_neon64.cc              | 234 +++++++++++++++++++++
 unit_test/planar_test.cc          | 158 ++++++++++++++
 11 files changed, 1688 insertions(+), 53 deletions(-)

diff --git a/README.chromium b/README.chromium
index 8b2021ac3..aad5369e5 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1783
+Version: 1784
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 85dda98c1..def773cb4 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -229,6 +229,60 @@ void MergeARGBPlane(const uint8_t* src_r,
                     int width,
                     int height);
 
+// Merge separate 'depth' bit R, G and B planes stored in lsb
+// into one interleaved XR30 plane.
+// depth should be in range [10, 16]
+LIBYUV_API
+void MergeXR30Plane(const uint16_t* src_r,
+                    int src_stride_r,
+                    const uint16_t* src_g,
+                    int src_stride_g,
+                    const uint16_t* src_b,
+                    int src_stride_b,
+                    uint8_t* dst_ar30,
+                    int dst_stride_ar30,
+                    int width,
+                    int height,
+                    int depth);
+
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved AR64 plane.
+// src_a can be NULL to fill alpha with an opaque value.
+// depth should be in range [1, 16]
+LIBYUV_API
+void MergeAR64Plane(const uint16_t* src_r,
+                    int src_stride_r,
+                    const uint16_t* src_g,
+                    int src_stride_g,
+                    const uint16_t* src_b,
+                    int src_stride_b,
+                    const uint16_t* src_a,
+                    int src_stride_a,
+                    uint16_t* dst_ar64,
+                    int dst_stride_ar64,
+                    int width,
+                    int height,
+                    int depth);
+
+// Merge separate 'depth' bit R, G, B and A planes stored in lsb
+// into one interleaved ARGB plane.
+// src_a can be NULL to fill alpha with an opaque value.
+// depth should be in range [8, 16]
+LIBYUV_API
+void MergeARGB16To8Plane(const uint16_t* src_r,
+                         int src_stride_r,
+                         const uint16_t* src_g,
+                         int src_stride_g,
+                         const uint16_t* src_b,
+                         int src_stride_b,
+                         const uint16_t* src_a,
+                         int src_stride_a,
+                         uint8_t* dst_argb,
+                         int dst_stride_argb,
+                         int width,
+                         int height,
+                         int depth);
+
 // Copy I400. Supports inverting.
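For reference, a minimal usage sketch of the new planar merge API declared above. It is illustrative only: the function and buffer names outside the libyuv calls are hypothetical, and, as in the unit tests added by this change, the uint16_t source strides are passed in elements while the packed destination strides are in bytes.

#include "libyuv/planar_functions.h"

// Hypothetical example: pack 10-bit planar R, G, B into AR30 and into
// 8-bit ARGB. Passing NULL for src_a selects the opaque-alpha path.
void MergeHighBitDepthExample(const uint16_t* r, const uint16_t* g,
                              const uint16_t* b, int width, int height,
                              uint8_t* dst_ar30, uint8_t* dst_argb) {
  // 10-bit planes -> 2:10:10:10 AR30 (B in the low bits, alpha forced opaque).
  libyuv::MergeXR30Plane(r, width, g, width, b, width, dst_ar30, width * 4,
                         width, height, /*depth=*/10);
  // Same planes -> 8-bit ARGB; with no alpha plane, alpha is written as 0xff.
  libyuv::MergeARGB16To8Plane(r, width, g, width, b, width,
                              /*src_a=*/NULL, /*src_stride_a=*/0, dst_argb,
                              width * 4, width, height, /*depth=*/10);
}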
LIBYUV_API int I400ToI400(const uint8_t* src_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9f159d40a..87ef32055 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -289,6 +289,7 @@ extern "C" { #define HAS_I410TOAR30ROW_SSSE3 #define HAS_I410TOARGBROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 +#define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_P210TOAR30ROW_SSSE3 @@ -300,6 +301,8 @@ extern "C" { #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITARGBROW_SSE2 #define HAS_SPLITARGBROW_SSSE3 +#define HAS_SPLITXRGBROW_SSE2 +#define HAS_SPLITXRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 @@ -330,7 +333,13 @@ extern "C" { #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 +#define HAS_MERGEAR64ROW_AVX2 +#define HAS_MERGEARGB16TO8ROW_AVX2 #define HAS_MERGEARGBROW_AVX2 +#define HAS_MERGEXR30ROW_AVX2 +#define HAS_MERGEXR64ROW_AVX2 +#define HAS_MERGEXRGB16TO8ROW_AVX2 +#define HAS_MERGEXRGBROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I212TOAR30ROW_AVX2 @@ -350,6 +359,7 @@ extern "C" { #define HAS_MULTIPLYROW_16_AVX2 #define HAS_RGBATOYJROW_AVX2 #define HAS_SPLITARGBROW_AVX2 +#define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 #define HAS_SWAPUVROW_AVX2 // TODO(fbarchard): Fix AVX2 version of YUV24 @@ -423,7 +433,13 @@ extern "C" { #define HAS_I422TOYUY2ROW_NEON #define HAS_I444TOARGBROW_NEON #define HAS_J400TOARGBROW_NEON +#define HAS_MERGEAR64ROW_NEON +#define HAS_MERGEARGB16TO8ROW_NEON #define HAS_MERGEARGBROW_NEON +#define HAS_MERGEXR30ROW_NEON +#define HAS_MERGEXR64ROW_NEON +#define HAS_MERGEXRGB16TO8ROW_NEON +#define HAS_MERGEXRGBROW_NEON #define HAS_MERGEUVROW_NEON #define HAS_MERGEUVROW_16_NEON #define HAS_MIRRORROW_NEON @@ -454,6 +470,7 @@ extern "C" { #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON #define HAS_SPLITARGBROW_NEON +#define HAS_SPLITXRGBROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_SPLITUVROW_16_NEON @@ -676,6 +693,7 @@ extern "C" { #else #define SIMD_ALIGNED(var) __declspec(align(16)) var #endif +#define LIBYUV_NOINLINE __declspec(noinline) typedef __declspec(align(16)) int16_t vec16[8]; typedef __declspec(align(16)) int32_t vec32[4]; typedef __declspec(align(16)) float vecf32[4]; @@ -696,6 +714,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32]; #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #endif +#define LIBYUV_NOINLINE __attribute__((noinline)) typedef int16_t __attribute__((vector_size(16))) vec16; typedef int32_t __attribute__((vector_size(16))) vec32; typedef float __attribute__((vector_size(16))) vecf32; @@ -711,6 +730,7 @@ typedef uint32_t __attribute__((vector_size(32))) ulvec32; typedef uint8_t __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var +#define LIBYUV_NOINLINE typedef int16_t vec16[8]; typedef int32_t vec32[4]; typedef float vecf32[4]; @@ -2061,6 +2081,179 @@ void SplitXRGBRow_Any_NEON(const uint8_t* src_argb, uint8_t* dst_b, int width); +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int 
depth, + int width); +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width); +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_Any_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_Any_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXR64Row_Any_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_Any_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXRGB16To8Row_Any_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR30Row_Any_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeXR30Row_10_Any_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width); +void MergeAR64Row_Any_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width); +void MergeARGB16To8Row_Any_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width); +void MergeXR64Row_Any_NEON(const uint16_t* src_r, + 
const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width); +void MergeXRGB16To8Row_Any_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width); + void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 2f565a747..21c6bc4f0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1783 +#define LIBYUV_VERSION 1784 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 38287af02..3ff5dfa20 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1026,7 +1026,7 @@ void SplitARGBPlane(const uint8_t* src_argb, dst_stride_a = 0; } -#if defined(HAS_SPLITARGBROW_SSE2) +#if defined(HAS_SPLITXRGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SplitXRGBRow = SplitXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { @@ -1034,7 +1034,7 @@ void SplitARGBPlane(const uint8_t* src_argb, } } #endif -#if defined(HAS_SPLITARGBROW_SSSE3) +#if defined(HAS_SPLITXRGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { SplitXRGBRow = SplitXRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { @@ -1042,7 +1042,7 @@ void SplitARGBPlane(const uint8_t* src_argb, } } #endif -#if defined(HAS_SPLITARGBROW_AVX2) +#if defined(HAS_SPLITXRGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { SplitXRGBRow = SplitXRGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { @@ -1050,7 +1050,7 @@ void SplitARGBPlane(const uint8_t* src_argb, } } #endif -#if defined(HAS_SPLITRGBROW_NEON) +#if defined(HAS_SPLITXRGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitXRGBRow = SplitXRGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { @@ -1112,7 +1112,7 @@ void SplitARGBPlane(const uint8_t* src_argb, } } #endif -#if defined(HAS_SPLITRGBROW_NEON) +#if defined(HAS_SPLITARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitARGBRow = SplitARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { @@ -1153,13 +1153,13 @@ void MergeARGBPlane(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) = MergeXRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } if (src_a == NULL) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } // Coalesce rows. 
if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_argb == width * 4) { @@ -1167,7 +1167,7 @@ void MergeARGBPlane(const uint8_t* src_r, height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; } -#if defined(HAS_MERGEARGBROW_SSE2) +#if defined(HAS_MERGEXRGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeXRGBRow = MergeXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { @@ -1175,7 +1175,7 @@ void MergeARGBPlane(const uint8_t* src_r, } } #endif -#if defined(HAS_MERGEARGBROW_AVX2) +#if defined(HAS_MERGEXRGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeXRGBRow = MergeXRGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { @@ -1183,7 +1183,7 @@ void MergeARGBPlane(const uint8_t* src_r, } } #endif -#if defined(HAS_MERGERGBROW_NEON) +#if defined(HAS_MERGEXRGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeXRGBRow = MergeXRGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { @@ -1200,12 +1200,6 @@ void MergeARGBPlane(const uint8_t* src_r, dst_argb += dst_stride_argb; } } else { - if (height < 0) { - height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; - } - if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_argb == width * 4) { @@ -1230,7 +1224,7 @@ void MergeARGBPlane(const uint8_t* src_r, } } #endif -#if defined(HAS_MERGERGBROW_NEON) +#if defined(HAS_MERGEARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeARGBRow = MergeARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { @@ -1249,6 +1243,263 @@ void MergeARGBPlane(const uint8_t* src_r, } } +LIBYUV_API +void MergeXR30Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height, + int depth) { + int y; + void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_ar30, int depth, + int width) = MergeXR30Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + // Coalesce rows. 
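  // (If every source plane is contiguous, i.e. its stride equals the row
  // width in uint16_t elements, and the AR30 output stride is width * 4
  // bytes, the whole image can be processed as a single row of
  // width * height pixels, which is what the check below enables.)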
+ if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; + } +#if defined(HAS_MERGEXR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR30Row = MergeXR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR30Row = MergeXR30Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXR30ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (depth == 10) { + MergeXR30Row = MergeXR30Row_10_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_10_NEON; + } + } else { + MergeXR30Row = MergeXR30Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR30Row = MergeXR30Row_NEON; + } + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar30 += dst_stride_ar30; + } +} + +LIBYUV_API +void MergeAR64Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height, + int depth) { + int y; + void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint16_t* dst_argb, int depth, int width) = + MergeAR64Row_C; + void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint16_t* dst_argb, int depth, + int width) = MergeXR64Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; + dst_stride_ar64 = -dst_stride_ar64; + } + if (src_a == NULL) { + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && + src_stride_b == width && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEXR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXR64Row = MergeXR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXR64Row = MergeXR64Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXR64Row = MergeXR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXR64Row = MergeXR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar64 += dst_stride_ar64; + } + } else { + if (src_stride_r == width && src_stride_g == width && + src_stride_b == width && src_stride_a == width && + dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_ar64 = 0; + } +#if defined(HAS_MERGEAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeAR64Row = MergeAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeAR64Row = MergeAR64Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeAR64Row = MergeAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeAR64Row = MergeAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_ar64 += dst_stride_ar64; + } + } +} + +LIBYUV_API +void MergeARGB16To8Plane(const uint16_t* src_r, + int src_stride_r, + const uint16_t* src_g, + int src_stride_g, + const uint16_t* src_b, + int src_stride_b, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int depth) { + int y; + void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, const uint16_t* src_a, + uint8_t* dst_argb, int depth, int width) = + MergeARGB16To8Row_C; + void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, + const uint16_t* src_b, uint8_t* dst_argb, int depth, + int width) = MergeXRGB16To8Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + if (src_a == NULL) { + // Coalesce rows. 
+ if (src_stride_r == width && src_stride_g == width && + src_stride_b == width && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; + } +#if defined(HAS_MERGEXRGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEXRGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } + } else { + if (src_stride_r == width && src_stride_g == width && + src_stride_b == width && src_stride_a == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = src_stride_a = + dst_stride_argb = 0; + } +#if defined(HAS_MERGEARGB16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + MergeARGB16To8Row = MergeARGB16To8Row_AVX2; + } + } +#endif +#if defined(HAS_MERGEARGB16TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + MergeARGB16To8Row = MergeARGB16To8Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_argb += dst_stride_argb; + } + } +} + // Convert YUY2 to I422. LIBYUV_API int YUY2ToI422(const uint8_t* src_yuy2, diff --git a/source/row_any.cc b/source/row_any.cc index b8c7f536e..1aebb3c07 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -183,6 +183,44 @@ ANY41CT(I410AlphaToARGBRow_Any_AVX2, #undef ANY41CT +// Any 4 planes to 1 plane with parameter +#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 4]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \ + memcpy(dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEAR64ROW_AVX2 +ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 4, 15) +#endif + +#ifdef HAS_MERGEAR64ROW_NEON +ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 4, 7) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +ANY41PT(MergeARGB16To8Row_Any_AVX2, MergeARGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_NEON +ANY41PT(MergeARGB16To8Row_Any_NEON, MergeARGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7) +#endif + +#undef ANY41PT + // Any 3 planes to 1. 
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -212,13 +250,13 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) #ifdef HAS_MERGERGBROW_MMI ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7) #endif -#ifdef HAS_MERGEARGBROW_SSE2 +#ifdef HAS_MERGEXRGBROW_SSE2 ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7) #endif -#ifdef HAS_MERGEARGBROW_AVX2 +#ifdef HAS_MERGEXRGBROW_AVX2 ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_MERGEARGBROW_NEON +#ifdef HAS_MERGEXRGBROW_NEON ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15) #endif #ifdef HAS_I422TOYUY2ROW_SSE2 @@ -424,6 +462,52 @@ ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif #undef ANY31CT +// Any 3 planes to 1 plane with parameter +#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \ + void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \ + DTYPE* dst_ptr, int depth, int width) { \ + SIMD_ALIGNED(STYPE temp[16 * 3]); \ + SIMD_ALIGNED(DTYPE out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \ + } \ + memcpy(temp, r_buf + n, r * SBPP); \ + memcpy(temp + 16, g_buf + n, r * SBPP); \ + memcpy(temp + 32, b_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \ + memcpy(dst_ptr + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_MERGEXR30ROW_AVX2 +ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +#endif + +#ifdef HAS_MERGEXR30ROW_NEON +ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) +ANY31PT(MergeXR30Row_10_Any_NEON, MergeXR30Row_10_NEON, uint16_t, 2, uint8_t, 4, 3) +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 4, 15) +#endif + +#ifdef HAS_MERGEXR64ROW_NEON +ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 4, 7) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +ANY31PT(MergeXRGB16To8Row_Any_AVX2, MergeXRGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_NEON +ANY31PT(MergeXRGB16To8Row_Any_NEON, MergeXRGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7) +#endif + +#undef ANY31PT + // Any 2 planes to 1. 
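The ANY31PT and ANY41PT wrappers above follow the usual libyuv "any width" pattern: the SIMD kernel handles the largest multiple of (MASK + 1) pixels directly, and the remainder is copied into zero-padded aligned scratch buffers, run through the kernel once more as a full block, and only the valid output pixels are copied back. As an illustration (the macro text above is authoritative), the instantiation MergeXR30Row_Any_NEON with MASK = 3 expands to roughly:

void MergeXR30Row_Any_NEON(const uint16_t* r_buf, const uint16_t* g_buf,
                           const uint16_t* b_buf, uint8_t* dst_ptr,
                           int depth, int width) {
  SIMD_ALIGNED(uint16_t temp[16 * 3]);
  SIMD_ALIGNED(uint8_t out[64]);
  memset(temp, 0, 16 * 3 * 2);  // zero the padded tail (msan)
  int r = width & 3;            // leftover pixels (width % 4)
  int n = width & ~3;           // largest multiple of 4
  if (n > 0) {
    MergeXR30Row_NEON(r_buf, g_buf, b_buf, dst_ptr, depth, n);
  }
  // Process the tail as one padded 4-pixel block, then keep only r pixels.
  memcpy(temp, r_buf + n, r * 2);
  memcpy(temp + 16, g_buf + n, r * 2);
  memcpy(temp + 32, b_buf + n, r * 2);
  MergeXR30Row_NEON(temp, temp + 16, temp + 32, out, depth, 4);
  memcpy(dst_ptr + n * 4, out, r * 4);
}

The MASK values simply mirror how many pixels each kernel consumes per iteration: 16 for the AVX2 rows, 8 or 4 for the NEON rows.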
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ @@ -1711,16 +1795,16 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) #ifdef HAS_SPLITRGBROW_MMI ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) #endif -#ifdef HAS_SPLITARGBROW_SSE2 +#ifdef HAS_SPLITXRGBROW_SSE2 ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7) #endif -#ifdef HAS_SPLITARGBROW_SSSE3 +#ifdef HAS_SPLITXRGBROW_SSSE3 ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7) #endif -#ifdef HAS_SPLITARGBROW_AVX2 +#ifdef HAS_SPLITXRGBROW_AVX2 ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15) #endif -#ifdef HAS_SPLITARGBROW_NEON +#ifdef HAS_SPLITXRGBROW_NEON ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15) #endif diff --git a/source/row_common.cc b/source/row_common.cc index 0e84961b3..3ccac51a8 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -56,6 +56,11 @@ static __inline int32_t clamp1023(int32_t v) { return (-(v >= 1023) | v) & 1023; } +// clamp to 2^n - 1 +static __inline int32_t clamp2nm1(int32_t v, int32_t max) { + return (-(v >= max) | v) & max; +} + static __inline uint32_t Abs(int32_t v) { int m = -(v < 0); return (v + m) ^ m; @@ -73,6 +78,10 @@ static __inline int32_t clamp1023(int32_t v) { return (v > 1023) ? 1023 : v; } +static __inline int32_t clamp2nm1(int32_t v, int32_t max) { + return (v > max) ? max : v; +} + static __inline uint32_t Abs(int32_t v) { return (v < 0) ? -v : v; } @@ -3010,6 +3019,105 @@ void MergeARGBRow_C(const uint8_t* src_r, } } +void MergeXR30Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + assert(depth >= 10); + assert(depth <= 16); + int x; + int shift = depth - 10; + uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; + for (x = 0; x < width; ++x) { + uint32_t r = clamp1023(src_r[x] >> shift); + uint32_t g = clamp1023(src_g[x] >> shift); + uint32_t b = clamp1023(src_b[x] >> shift); + dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; + } +} + +void MergeAR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = clamp2nm1(src_b[x], max) << shift; + dst_ar64[1] = clamp2nm1(src_g[x], max) << shift; + dst_ar64[2] = clamp2nm1(src_r[x], max) << shift; + dst_ar64[3] = clamp2nm1(src_a[x], max) << shift; + dst_ar64 += 4; + } +} + +void MergeARGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = clamp255(src_a[x] >> shift); + dst_argb += 4; + } +} + +void MergeXR64Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + assert(depth >= 1); + assert(depth <= 16); + int x; + int shift = 16 - depth; + int max = (1 << depth) - 1; + for (x = 0; x < width; ++x) { + dst_ar64[0] = clamp2nm1(src_b[x], max) << shift; + dst_ar64[1] = clamp2nm1(src_g[x], max) << shift; + dst_ar64[2] = clamp2nm1(src_r[x], max) << shift; 
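    // At this point b, g and r have been clamped to the depth's maximum
    // (2^depth - 1) and left-shifted so the value occupies the top bits of
    // the 16-bit lane; e.g. for depth 10: max = 1023, shift = 6, so an
    // in-range 1000 becomes 64000. The X variant has no alpha plane, so
    // alpha is written fully opaque below.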
+ dst_ar64[3] = 0xffff; + dst_ar64 += 4; + } +} + +void MergeXRGB16To8Row_C(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + assert(depth >= 8); + assert(depth <= 16); + int x; + int shift = depth - 8; + for (x = 0; x < width; ++x) { + dst_argb[0] = clamp255(src_b[x] >> shift); + dst_argb[1] = clamp255(src_g[x] >> shift); + dst_argb[2] = clamp255(src_r[x] >> shift); + dst_argb[3] = 0xff; + dst_argb += 4; + } +} + void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 2591bcfee..3b63fe2d2 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -5262,7 +5262,9 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_MERGEXRGBROW_SSE2 void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -5346,7 +5348,9 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_MERGEXRGBROW_AVX2 void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -5440,7 +5444,9 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } +#endif +#ifdef HAS_SPLITXRGBROW_SSE2 void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -5536,7 +5542,9 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, : "m"(kShuffleMaskARGBSplit) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } +#endif +#ifdef HAS_SPLITXRGBROW_SSSE3 void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -5628,7 +5636,9 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, "m"(kShuffleMaskARGBPermute) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } +#endif +#ifdef HAS_SPLITXRGBROW_AVX2 void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -5670,7 +5680,330 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, "+r"(width) // %4 : "m"(kShuffleMaskARGBSplit), // %5 "m"(kShuffleMaskARGBPermute) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_MERGEXR30ROW_AVX2 +void MergeXR30Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = depth - 10; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $6,%%ymm6,%%ymm6 \n" + "vmovd %5,%%xmm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1),%%ymm1 \n" + "vmovdqu (%0,%2),%%ymm2 \n" + "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" + "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit + "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB + "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" + "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit + "vpslld $0xa,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine + "vpor 
%%ymm2,%%ymm3,%%ymm3 \n" + "vmovdqu %%ymm0,(%3) \n" + "vmovdqu %%ymm3,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 +#if defined(__i386__) + : "m"(shift) // %5 +#else + : "rm"(shift) // %5 +#endif + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif + +#ifdef HAS_MERGEAR64ROW_AVX2 +static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; +void MergeAR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vmovdqa %8,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + "vbroadcastss %7,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpermd %%ymm3,%%ymm5,%%ymm3 \n" + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%4) \n" + "vmovdqu %%ymm2,0x20(%4) \n" + "vmovdqu %%ymm4,0x40(%4) \n" + "vmovdqu %%ymm1,0x60(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 +#if defined(__i386__) + "+m"(width) // %5 + : "m"(shift), // %6 + "m"(mask), // %7 +#else + "+rm"(width) // %5 + : "rm"(shift), // %6 + "rm"(mask), // %7 +#endif + "m"(MergeAR64Permute) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEXR64ROW_AVX2 +void MergeXR64Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + mask = (mask << 16) + mask; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vmovdqa %7,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vbroadcastss %6,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" + "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" + "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" + "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" + "vpermd %%ymm0,%%ymm5,%%ymm0 \n" + "vpermd %%ymm1,%%ymm5,%%ymm1 \n" + "vpermd %%ymm2,%%ymm5,%%ymm2 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) + "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) + "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) + 
"vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) + "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) + "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) + "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) + "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) + "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) + "vmovdqu %%ymm3,(%3) \n" + "vmovdqu %%ymm2,0x20(%3) \n" + "vmovdqu %%ymm4,0x40(%3) \n" + "vmovdqu %%ymm1,0x60(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x80(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 +#if defined(__i386__) + : "m"(shift), // %5 + "m"(mask), // %6 +#else + : "rm"(shift), // %5 + "rm"(mask), // %6 +#endif + "m"(MergeAR64Permute) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_MERGEARGB16TO8ROW_AVX2 +static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; +void MergeARGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "vbroadcastf128 %7,%%ymm5 \n" + "vmovd %6,%%xmm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vmovdqu (%0,%3),%%ymm3 \n" // A + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) + "vmovdqu %%ymm2,(%4) \n" + "vmovdqu %%ymm0,0x20(%4) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%4),%4 \n" + "subl $0x10,%5 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 +#if defined(__i386__) + "+m"(width) // %5 + : "m"(shift), // %6 +#else + "+rm"(width) // %5 + : "rm"(shift), // %6 +#endif + "m"(MergeARGB16To8Shuffle) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_MERGEXRGB16TO8ROW_AVX2 +void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = depth - 8; + asm volatile( + + "sub %0,%1 \n" + "sub %0,%2 \n" + "vbroadcastf128 %6,%%ymm5 \n" + "vmovd %5,%%xmm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // R + "vmovdqu (%0,%1),%%ymm1 \n" // G + "vmovdqu (%0,%2),%%ymm2 \n" // B + "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) + "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // 
BGRA (hi) + "vmovdqu %%ymm2,(%3) \n" + "vmovdqu %%ymm0,0x20(%3) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%3),%3 \n" + "subl $0x10,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 +#if defined(__i386__) + : "m"(shift), // %5 +#else + : "rm"(shift), // %5 +#endif + "m"(MergeARGB16To8Shuffle) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif diff --git a/source/row_neon.cc b/source/row_neon.cc index 2165d0d01..ce0759f56 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -415,11 +415,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 @@ -438,11 +438,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_argb), // %2 @@ -537,11 +537,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -558,11 +558,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -760,8 +760,8 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q0}, [%2]! \n" // load B "subs %4, %4, #16 \n" // 16 processed per loop - "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB - "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB + "vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB + "vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,6 +773,226 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, ); } +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "vmov.u32 q14, #1023 \n" + "vdup.32 q15, %5 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! 
\n" // R + "vmovl.u16 q2, d4 \n" // B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vshl.u32 q2, q2, q15 \n" // 000B + "vshl.u32 q1, q1, q15 \n" + "vshl.u32 q0, q0, q15 \n" + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "vmov.u32 q14, #1023 \n" + "1: \n" + "vld1.16 {d4}, [%2]! \n" // B + "vld1.16 {d2}, [%1]! \n" // G + "vld1.16 {d0}, [%0]! \n" // R + "vmovl.u16 q2, d4 \n" // 000B + "vmovl.u16 q1, d2 \n" // G + "vmovl.u16 q0, d0 \n" // R + "vmin.u32 q2, q2, q14 \n" + "vmin.u32 q1, q1, q14 \n" + "vmin.u32 q0, q0, q14 \n" + "vsli.u32 q2, q1, #10 \n" // 00GB + "vsli.u32 q2, q0, #20 \n" // 0RGB + "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) + "subs %4, %4, #4 \n" + "vst1.8 {q2}, [%3]! \n" + "bgt 1b \n" + "3: \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q14"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vdup.u16 q15, %6 \n" + "vdup.u16 q14, %7 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vmin.u16 q3, q3, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "subs %5, %5, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%4]! \n" + "vst4.16 {d1, d3, d5, d7}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "vmov.u8 q3, #0xff \n" // A (0xffff) + "vdup.u16 q15, %5 \n" + "vdup.u16 q14, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vmin.u16 q2, q2, q14 \n" + "vmin.u16 q1, q1, q14 \n" + "vmin.u16 q0, q0, q14 \n" + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "subs %4, %4, #8 \n" + "vst4.16 {d0, d2, d4, d6}, [%3]! \n" + "vst4.16 {d1, d3, d5, d7}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %6 \n" + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vld1.16 {q3}, [%3]! \n" // A + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vshl.u16 q3, q3, q15 \n" + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d1, q1 \n" + "vqmovn.u16 d2, q2 \n" + "vqmovn.u16 d3, q3 \n" + "subs %5, %5, #8 \n" + "vst4.8 {d0, d1, d2, d3}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "vdup.16 q15, %5 \n" + "vmov.u8 d6, #0xff \n" // A (0xff) + "1: \n" + "vld1.16 {q2}, [%0]! \n" // R + "vld1.16 {q1}, [%1]! \n" // G + "vld1.16 {q0}, [%2]! \n" // B + "vshl.u16 q2, q2, q15 \n" + "vshl.u16 q1, q1, q15 \n" + "vshl.u16 q0, q0, q15 \n" + "vqmovn.u16 d5, q2 \n" + "vqmovn.u16 d4, q1 \n" + "vqmovn.u16 d3, q0 \n" + "subs %4, %4, #8 \n" + "vst4.u8 {d3, d4, d5, d6}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "q0", "q1", "q2", "d6", "q15"); +} + // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 903bf5cd4..517d38aea 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -874,6 +874,240 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, ); } +void MergeXR30Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int depth, + int width) { + int shift = 10 - depth; + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "dup v31.4s, %w5 \n" + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "ushl v2.4s, v2.4s, v31.4s \n" // 000B + "ushl v1.4s, v1.4s, v31.4s \n" // G + "ushl v0.4s, v0.4s, v31.4s \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); +} + +void MergeXR30Row_10_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_ar30, + int /* depth */, + int width) { + asm volatile( + "movi v30.16b, #255 \n" + "ushr v30.4s, v30.4s, #22 \n" // 1023 + "1: \n" + "ldr d2, [%2], #8 \n" // B + "ldr d1, [%1], #8 \n" // G + "ldr d0, [%0], #8 \n" // R + "ushll v2.4s, v2.4h, #0 \n" // 000B + "ushll v1.4s, v1.4h, #0 \n" // G + "ushll v0.4s, v0.4h, #0 \n" // R + "umin v2.4s, v2.4s, v30.4s \n" + "umin v1.4s, v1.4s, v30.4s \n" + "umin v0.4s, v0.4s, v30.4s \n" + "sli v2.4s, v1.4s, #10 \n" // 00GB + "sli v2.4s, v0.4s, #20 \n" // 0RGB + "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30) + "subs %w4, %w4, #4 \n" + "str q2, [%3], #16 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar30), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "v0", "v1", "v2", "v30"); +} + +void MergeAR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint16_t* dst_ar64, + int depth, + int width) { + int shift = 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "dup v30.8h, %w7 \n" + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "umin v3.8h, v3.8h, v30.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_ar64), // %4 + "+r"(width) // %5 + : "r"(shift), // %6 + "r"(mask) // %7 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXR64Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint16_t* dst_ar64, + int depth, + int width) { + int shift 
= 16 - depth; + int mask = (1 << depth) - 1; + asm volatile( + + "movi v3.16b, #0xff \n" // A (0xffff) + "dup v30.8h, %w6 \n" + "dup v31.8h, %w5 \n" + + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "umin v2.8h, v2.8h, v30.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "umin v1.8h, v1.8h, v30.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "umin v0.8h, v0.8h, v30.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v2.8h, v2.8h, v31.8h \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_ar64), // %3 + "+r"(width) // %4 + : "r"(shift), // %5 + "r"(mask) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeARGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + const uint16_t* src_a, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w6 \n" + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ldr q3, [%3], #16 \n" // A + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "ushl v3.8h, v3.8h, v31.8h \n" + "prfm pldl1keep, [%3, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "uqxtn v3.8b, v3.8h \n" + "subs %w5, %w5, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : "r"(shift) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + +void MergeXRGB16To8Row_NEON(const uint16_t* src_r, + const uint16_t* src_g, + const uint16_t* src_b, + uint8_t* dst_argb, + int depth, + int width) { + int shift = 8 - depth; + asm volatile( + + "dup v31.8h, %w5 \n" + "movi v3.8b, #0xff \n" // A (0xff) + "1: \n" + "ldr q2, [%0], #16 \n" // R + "ldr q1, [%1], #16 \n" // G + "ldr q0, [%2], #16 \n" // B + "ushl v2.8h, v2.8h, v31.8h \n" + "prfm pldl1keep, [%0, 448] \n" + "ushl v1.8h, v1.8h, v31.8h \n" + "prfm pldl1keep, [%1, 448] \n" + "ushl v0.8h, v0.8h, v31.8h \n" + "prfm pldl1keep, [%2, 448] \n" + "uqxtn v2.8b, v2.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v0.8b, v0.8h \n" + "subs %w4, %w4, #8 \n" + "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(shift) // %5 + : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); +} + // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 6ecde87ba..3449c864f 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -3091,6 +3091,164 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ + align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + STYPE* src_pixels_r = reinterpret_cast(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast(src_memory_b + OFF); \ + STYPE* src_pixels_a = reinterpret_cast(src_memory_a + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast(dst_memory_opt); \ + for (int i = 0; i < kPixels; ++i) { \ + src_pixels_r[i] = fastrand() & 65535; \ + src_pixels_g[i] = fastrand() & 65535; \ + src_pixels_b[i] = fastrand() & 65535; \ + src_pixels_a[i] = fastrand() & 65535; \ + } \ + memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \ + kWidth, NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \ + kWidth, NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(src_memory_a); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ + align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + STYPE* src_pixels_r = reinterpret_cast(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast(src_memory_b + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast(dst_memory_opt); \ + for (int i = 0; i < kPixels; ++i) { \ + src_pixels_r[i] = fastrand() & 65535; \ + src_pixels_g[i] = fastrand() & 65535; \ + src_pixels_b[i] = fastrand() & 65535; \ + } \ + memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, \ + 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) + +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10) +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12) +TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12) +TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16) + +#define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ + TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \ + align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ + align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ + align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ + STYPE* src_pixels_r = reinterpret_cast(src_memory_r + OFF); \ + STYPE* src_pixels_g = reinterpret_cast(src_memory_g + OFF); \ + STYPE* src_pixels_b = reinterpret_cast(src_memory_b + OFF); \ + DTYPE* dst_pixels_c = reinterpret_cast(dst_memory_c); \ + DTYPE* dst_pixels_opt = reinterpret_cast(dst_memory_opt); \ + for (int i = 0; i < kPixels; ++i) { \ + src_pixels_r[i] = fastrand() & 65535; \ + src_pixels_g[i] = fastrand() & 65535; \ + src_pixels_b[i] = fastrand() & 65535; \ + } \ + memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \ + memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \ + MaskCpuFlags(disable_cpu_flags_); \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, dst_pixels_c, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ + kWidth, dst_pixels_opt, kWidth * 4, kWidth, \ + NEG benchmark_height_, DEPTH); \ + } \ + for (int i = 0; i < kPixels * 4; ++i) { \ + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_memory_r); \ + free_aligned_buffer_page_end(src_memory_g); \ + free_aligned_buffer_page_end(src_memory_b); \ + free_aligned_buffer_page_end(dst_memory_c); \ + free_aligned_buffer_page_end(dst_memory_opt); \ + } + +#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ + 1) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ + TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) + +TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10) +TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12) +TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) + // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {