Add MergeAR30Plane, MergeAR64Plane, MergeARGB16To8Plane

These functions merge high bit depth planar RGB(A) pixels, stored in the low bits of 16-bit samples, into packed AR30, AR64 and 8-bit ARGB formats.
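
A minimal usage sketch (not part of this commit; the PackExample wrapper, buffer names and the 10-bit depth are illustrative, and it assumes the new declarations are exposed through libyuv/planar_functions.h alongside MergeARGBPlane, as the header hunk below suggests):

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/planar_functions.h"

int PackExample(const uint16_t* r, const uint16_t* g, const uint16_t* b,
                const uint16_t* a, int width, int height) {
  // Strides follow the element type of each pointer: width for the
  // 16-bit source planes, width * 4 for the 4-channel packed outputs.
  uint8_t* ar30 = (uint8_t*)malloc((size_t)width * height * 4);
  uint16_t* ar64 = (uint16_t*)malloc((size_t)width * height * 4 * sizeof(uint16_t));
  uint8_t* argb = (uint8_t*)malloc((size_t)width * height * 4);
  if (!ar30 || !ar64 || !argb) {
    free(ar30); free(ar64); free(argb);
    return -1;
  }
  // 10-bit samples are stored in the low bits of each uint16_t.
  MergeXR30Plane(r, width, g, width, b, width, ar30, width * 4,
                 width, height, /*depth=*/10);
  // Pass src_a, or NULL with a 0 stride for opaque alpha.
  MergeAR64Plane(r, width, g, width, b, width, a, width,
                 ar64, width * 4, width, height, /*depth=*/10);
  MergeARGB16To8Plane(r, width, g, width, b, width, NULL, 0,
                      argb, width * 4, width, height, /*depth=*/10);
  free(ar30); free(ar64); free(argb);
  return 0;
}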

Change-Id: I506935a164b069e6b2fed8bf152cb874310c0916
Bug: libyuv:886, libyuv:889
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2780468
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Author: Yuan Tong, 2021-03-20 23:22:08 +08:00; committed by Frank Barchard
parent 2525698acb
commit 8a13626e42
11 changed files with 1688 additions and 53 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1783
Version: 1784
License: BSD
License File: LICENSE

View File

@ -229,6 +229,60 @@ void MergeARGBPlane(const uint8_t* src_r,
int width,
int height);
// Merge separate 'depth' bit R, G and B planes stored in lsb
// into one interleaved XR30 plane.
// depth should be in range [10, 16]
LIBYUV_API
void MergeXR30Plane(const uint16_t* src_r,
int src_stride_r,
const uint16_t* src_g,
int src_stride_g,
const uint16_t* src_b,
int src_stride_b,
uint8_t* dst_ar30,
int dst_stride_ar30,
int width,
int height,
int depth);
// Merge separate 'depth' bit R, G, B and A planes stored in lsb
// into one interleaved AR64 plane.
// src_a can be NULL, in which case alpha is set to the opaque value.
// depth should be in range [1, 16]
LIBYUV_API
void MergeAR64Plane(const uint16_t* src_r,
int src_stride_r,
const uint16_t* src_g,
int src_stride_g,
const uint16_t* src_b,
int src_stride_b,
const uint16_t* src_a,
int src_stride_a,
uint16_t* dst_ar64,
int dst_stride_ar64,
int width,
int height,
int depth);
// Merge separate 'depth' bit R, G, B and A planes stored in lsb
// into one interleaved ARGB plane.
// src_a can be NULL, in which case alpha is set to the opaque value.
// depth should be in range [8, 16]
LIBYUV_API
void MergeARGB16To8Plane(const uint16_t* src_r,
int src_stride_r,
const uint16_t* src_g,
int src_stride_g,
const uint16_t* src_b,
int src_stride_b,
const uint16_t* src_a,
int src_stride_a,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
int depth);
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,

View File

@ -289,6 +289,7 @@ extern "C" {
#define HAS_I410TOAR30ROW_SSSE3
#define HAS_I410TOARGBROW_SSSE3
#define HAS_MERGEARGBROW_SSE2
#define HAS_MERGEXRGBROW_SSE2
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_SSSE3
#define HAS_P210TOAR30ROW_SSSE3
@ -300,6 +301,8 @@ extern "C" {
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITARGBROW_SSE2
#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITXRGBROW_SSE2
#define HAS_SPLITXRGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
@ -330,7 +333,13 @@ extern "C" {
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_DIVIDEROW_16_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_MERGEAR64ROW_AVX2
#define HAS_MERGEARGB16TO8ROW_AVX2
#define HAS_MERGEARGBROW_AVX2
#define HAS_MERGEXR30ROW_AVX2
#define HAS_MERGEXR64ROW_AVX2
#define HAS_MERGEXRGB16TO8ROW_AVX2
#define HAS_MERGEXRGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I212TOAR30ROW_AVX2
@ -350,6 +359,7 @@ extern "C" {
#define HAS_MULTIPLYROW_16_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SPLITXRGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24
@ -423,7 +433,13 @@ extern "C" {
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I444TOARGBROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEAR64ROW_NEON
#define HAS_MERGEARGB16TO8ROW_NEON
#define HAS_MERGEARGBROW_NEON
#define HAS_MERGEXR30ROW_NEON
#define HAS_MERGEXR64ROW_NEON
#define HAS_MERGEXRGB16TO8ROW_NEON
#define HAS_MERGEXRGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MERGEUVROW_16_NEON
#define HAS_MIRRORROW_NEON
@ -454,6 +470,7 @@ extern "C" {
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITARGBROW_NEON
#define HAS_SPLITXRGBROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_SPLITUVROW_16_NEON
@ -676,6 +693,7 @@ extern "C" {
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
#define LIBYUV_NOINLINE __declspec(noinline)
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
typedef __declspec(align(16)) float vecf32[4];
@ -696,6 +714,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
#define LIBYUV_NOINLINE __attribute__((noinline))
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
typedef float __attribute__((vector_size(16))) vecf32;
@ -711,6 +730,7 @@ typedef uint32_t __attribute__((vector_size(32))) ulvec32;
typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
#define LIBYUV_NOINLINE
typedef int16_t vec16[8];
typedef int32_t vec32[4];
typedef float vecf32[4];
@ -2061,6 +2081,179 @@ void SplitXRGBRow_Any_NEON(const uint8_t* src_argb,
uint8_t* dst_b,
int width);
void MergeXR30Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width);
void MergeAR64Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width);
void MergeARGB16To8Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR64Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width);
void MergeXRGB16To8Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR30Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width);
void MergeAR64Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width);
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR64Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width);
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR30Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width);
void MergeXR30Row_10_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int /* depth */,
int width);
void MergeAR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width);
void MergeARGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width);
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR30Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width);
void MergeAR64Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width);
void MergeXR64Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width);
void MergeARGB16To8Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width);
void MergeXRGB16To8Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR30Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width);
void MergeXR30Row_10_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width);
void MergeAR64Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width);
void MergeARGB16To8Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR64Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width);
void MergeXRGB16To8Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width);
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1783
#define LIBYUV_VERSION 1784
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -1026,7 +1026,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
dst_stride_a = 0;
}
#if defined(HAS_SPLITARGBROW_SSE2)
#if defined(HAS_SPLITXRGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitXRGBRow = SplitXRGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
@ -1034,7 +1034,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_SPLITARGBROW_SSSE3)
#if defined(HAS_SPLITXRGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitXRGBRow = SplitXRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
@ -1042,7 +1042,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_SPLITARGBROW_AVX2)
#if defined(HAS_SPLITXRGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitXRGBRow = SplitXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
@ -1050,7 +1050,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_SPLITRGBROW_NEON)
#if defined(HAS_SPLITXRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitXRGBRow = SplitXRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
@ -1112,7 +1112,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_SPLITRGBROW_NEON)
#if defined(HAS_SPLITARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitARGBRow = SplitARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
@ -1153,13 +1153,13 @@ void MergeARGBPlane(const uint8_t* src_r,
const uint8_t* src_b, uint8_t* dst_argb, int width) =
MergeXRGBRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
if (src_a == NULL) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && dst_stride_argb == width * 4) {
@ -1167,7 +1167,7 @@ void MergeARGBPlane(const uint8_t* src_r,
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
}
#if defined(HAS_MERGEARGBROW_SSE2)
#if defined(HAS_MERGEXRGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeXRGBRow = MergeXRGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
@ -1175,7 +1175,7 @@ void MergeARGBPlane(const uint8_t* src_r,
}
}
#endif
#if defined(HAS_MERGEARGBROW_AVX2)
#if defined(HAS_MERGEXRGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeXRGBRow = MergeXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
@ -1183,7 +1183,7 @@ void MergeARGBPlane(const uint8_t* src_r,
}
}
#endif
#if defined(HAS_MERGERGBROW_NEON)
#if defined(HAS_MERGEXRGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeXRGBRow = MergeXRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
@ -1200,12 +1200,6 @@ void MergeARGBPlane(const uint8_t* src_r,
dst_argb += dst_stride_argb;
}
} else {
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && src_stride_a == width &&
dst_stride_argb == width * 4) {
@ -1230,7 +1224,7 @@ void MergeARGBPlane(const uint8_t* src_r,
}
}
#endif
#if defined(HAS_MERGERGBROW_NEON)
#if defined(HAS_MERGEARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeARGBRow = MergeARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
@ -1249,6 +1243,263 @@ void MergeARGBPlane(const uint8_t* src_r,
}
}
LIBYUV_API
void MergeXR30Plane(const uint16_t* src_r,
int src_stride_r,
const uint16_t* src_g,
int src_stride_g,
const uint16_t* src_b,
int src_stride_b,
uint8_t* dst_ar30,
int dst_stride_ar30,
int width,
int height,
int depth) {
int y;
void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g,
const uint16_t* src_b, uint8_t* dst_ar30, int depth,
int width) = MergeXR30Row_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
dst_stride_ar30 = -dst_stride_ar30;
}
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
dst_stride_ar30 == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0;
}
#if defined(HAS_MERGEXR30ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeXR30Row = MergeXR30Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeXR30Row = MergeXR30Row_AVX2;
}
}
#endif
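// NEON has a dedicated 10-bit kernel that skips the per-pixel shift,
// since depth == 10 needs no scaling before packing.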
#if defined(HAS_MERGEXR30ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
if (depth == 10) {
MergeXR30Row = MergeXR30Row_10_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeXR30Row = MergeXR30Row_10_NEON;
}
} else {
MergeXR30Row = MergeXR30Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeXR30Row = MergeXR30Row_NEON;
}
}
}
#endif
for (y = 0; y < height; ++y) {
MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_ar30 += dst_stride_ar30;
}
}
LIBYUV_API
void MergeAR64Plane(const uint16_t* src_r,
int src_stride_r,
const uint16_t* src_g,
int src_stride_g,
const uint16_t* src_b,
int src_stride_b,
const uint16_t* src_a,
int src_stride_a,
uint16_t* dst_ar64,
int dst_stride_ar64,
int width,
int height,
int depth) {
int y;
void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g,
const uint16_t* src_b, const uint16_t* src_a,
uint16_t* dst_argb, int depth, int width) =
MergeAR64Row_C;
void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g,
const uint16_t* src_b, uint16_t* dst_argb, int depth,
int width) = MergeXR64Row_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64;
dst_stride_ar64 = -dst_stride_ar64;
}
if (src_a == NULL) {
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && dst_stride_ar64 == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0;
}
#if defined(HAS_MERGEXR64ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeXR64Row = MergeXR64Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeXR64Row = MergeXR64Row_AVX2;
}
}
#endif
#if defined(HAS_MERGEXR64ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeXR64Row = MergeXR64Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeXR64Row = MergeXR64Row_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_ar64 += dst_stride_ar64;
}
} else {
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && src_stride_a == width &&
dst_stride_ar64 == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = src_stride_a =
dst_stride_ar64 = 0;
}
#if defined(HAS_MERGEAR64ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeAR64Row = MergeAR64Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeAR64Row = MergeAR64Row_AVX2;
}
}
#endif
#if defined(HAS_MERGEAR64ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeAR64Row = MergeAR64Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeAR64Row = MergeAR64Row_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_ar64 += dst_stride_ar64;
}
}
}
LIBYUV_API
void MergeARGB16To8Plane(const uint16_t* src_r,
int src_stride_r,
const uint16_t* src_g,
int src_stride_g,
const uint16_t* src_b,
int src_stride_b,
const uint16_t* src_a,
int src_stride_a,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height,
int depth) {
int y;
void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
const uint16_t* src_b, const uint16_t* src_a,
uint8_t* dst_argb, int depth, int width) =
MergeARGB16To8Row_C;
void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
const uint16_t* src_b, uint8_t* dst_argb, int depth,
int width) = MergeXRGB16To8Row_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
if (src_a == NULL) {
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
}
#if defined(HAS_MERGEXRGB16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2;
}
}
#endif
#if defined(HAS_MERGEXRGB16TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeXRGB16To8Row = MergeXRGB16To8Row_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_argb += dst_stride_argb;
}
} else {
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && src_stride_a == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = src_stride_a =
dst_stride_argb = 0;
}
#if defined(HAS_MERGEARGB16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeARGB16To8Row = MergeARGB16To8Row_AVX2;
}
}
#endif
#if defined(HAS_MERGEARGB16TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeARGB16To8Row = MergeARGB16To8Row_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_argb += dst_stride_argb;
}
}
}
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,

View File

@ -183,6 +183,44 @@ ANY41CT(I410AlphaToARGBRow_Any_AVX2,
#undef ANY41CT
// Any 4 planes to 1 plane with parameter
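// The *_Any_* wrappers produced below run the SIMD kernel on the largest
// multiple of (MASK + 1) pixels, copy the remaining 0..MASK pixels of each
// plane into a zero padded temp block, run the kernel once more on that
// block, and copy only the valid output pixels back.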
#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \
SIMD_ALIGNED(STYPE temp[16 * 4]); \
SIMD_ALIGNED(DTYPE out[64]); \
memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n); \
} \
memcpy(temp, r_buf + n, r * SBPP); \
memcpy(temp + 16, g_buf + n, r * SBPP); \
memcpy(temp + 32, b_buf + n, r * SBPP); \
memcpy(temp + 48, a_buf + n, r * SBPP); \
ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \
memcpy(dst_ptr + n * BPP, out, r * BPP * sizeof(DTYPE)); \
}
#ifdef HAS_MERGEAR64ROW_AVX2
ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 4, 15)
#endif
#ifdef HAS_MERGEAR64ROW_NEON
ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 4, 7)
#endif
#ifdef HAS_MERGEARGB16TO8ROW_AVX2
ANY41PT(MergeARGB16To8Row_Any_AVX2, MergeARGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#endif
#ifdef HAS_MERGEARGB16TO8ROW_NEON
ANY41PT(MergeARGB16To8Row_Any_NEON, MergeARGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7)
#endif
#undef ANY41PT
// Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
@ -212,13 +250,13 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#ifdef HAS_MERGERGBROW_MMI
ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
#endif
#ifdef HAS_MERGEARGBROW_SSE2
#ifdef HAS_MERGEXRGBROW_SSE2
ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
#endif
#ifdef HAS_MERGEARGBROW_AVX2
#ifdef HAS_MERGEXRGBROW_AVX2
ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_MERGEARGBROW_NEON
#ifdef HAS_MERGEXRGBROW_NEON
ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
@ -424,6 +462,52 @@ ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#undef ANY31CT
// Any 3 planes to 1 plane with parameter
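// Same remainder strategy as ANY41PT above, for three source planes plus
// the depth parameter.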
#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
DTYPE* dst_ptr, int depth, int width) { \
SIMD_ALIGNED(STYPE temp[16 * 3]); \
SIMD_ALIGNED(DTYPE out[64]); \
memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n); \
} \
memcpy(temp, r_buf + n, r * SBPP); \
memcpy(temp + 16, g_buf + n, r * SBPP); \
memcpy(temp + 32, b_buf + n, r * SBPP); \
ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \
memcpy(dst_ptr + n * BPP, out, r * BPP * sizeof(DTYPE)); \
}
#ifdef HAS_MERGEXR30ROW_AVX2
ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#endif
#ifdef HAS_MERGEXR30ROW_NEON
ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3)
ANY31PT(MergeXR30Row_10_Any_NEON, MergeXR30Row_10_NEON, uint16_t, 2, uint8_t, 4, 3)
#endif
#ifdef HAS_MERGEXR64ROW_AVX2
ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 4, 15)
#endif
#ifdef HAS_MERGEXR64ROW_NEON
ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 4, 7)
#endif
#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
ANY31PT(MergeXRGB16To8Row_Any_AVX2, MergeXRGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#endif
#ifdef HAS_MERGEXRGB16TO8ROW_NEON
ANY31PT(MergeXRGB16To8Row_Any_NEON, MergeXRGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7)
#endif
#undef ANY31PT
// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
@ -1711,16 +1795,16 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#ifdef HAS_SPLITRGBROW_MMI
ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#endif
#ifdef HAS_SPLITARGBROW_SSE2
#ifdef HAS_SPLITXRGBROW_SSE2
ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
#ifdef HAS_SPLITXRGBROW_SSSE3
ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_AVX2
#ifdef HAS_SPLITXRGBROW_AVX2
ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
#endif
#ifdef HAS_SPLITARGBROW_NEON
#ifdef HAS_SPLITXRGBROW_NEON
ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
#endif

View File

@ -56,6 +56,11 @@ static __inline int32_t clamp1023(int32_t v) {
return (-(v >= 1023) | v) & 1023;
}
// clamp to 2^n - 1
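// Branch-free: -(v >= max) is all ones when v >= max, so the OR saturates
// v and the AND with max (2^n - 1) clamps it back into range,
// e.g. clamp2nm1(1100, 1023) = (-1 | 1100) & 1023 = 1023.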
static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
return (-(v >= max) | v) & max;
}
static __inline uint32_t Abs(int32_t v) {
int m = -(v < 0);
return (v + m) ^ m;
@ -73,6 +78,10 @@ static __inline int32_t clamp1023(int32_t v) {
return (v > 1023) ? 1023 : v;
}
static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
return (v > max) ? max : v;
}
static __inline uint32_t Abs(int32_t v) {
return (v < 0) ? -v : v;
}
@ -3010,6 +3019,105 @@ void MergeARGBRow_C(const uint8_t* src_r,
}
}
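// AR30 pixels are little-endian 32-bit words: 10-bit B in bits 0..9,
// G in bits 10..19, R in bits 20..29 and a 2-bit alpha (forced to 3)
// in bits 30..31. Samples are shifted down from 'depth' bits and
// clamped to 10 bits.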
void MergeXR30Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width) {
assert(depth >= 10);
assert(depth <= 16);
int x;
int shift = depth - 10;
uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30;
for (x = 0; x < width; ++x) {
uint32_t r = clamp1023(src_r[x] >> shift);
uint32_t g = clamp1023(src_g[x] >> shift);
uint32_t b = clamp1023(src_b[x] >> shift);
dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000;
}
}
void MergeAR64Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width) {
assert(depth >= 1);
assert(depth <= 16);
int x;
int shift = 16 - depth;
int max = (1 << depth) - 1;
for (x = 0; x < width; ++x) {
dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
dst_ar64[3] = clamp2nm1(src_a[x], max) << shift;
dst_ar64 += 4;
}
}
void MergeARGB16To8Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width) {
assert(depth >= 8);
assert(depth <= 16);
int x;
int shift = depth - 8;
for (x = 0; x < width; ++x) {
dst_argb[0] = clamp255(src_b[x] >> shift);
dst_argb[1] = clamp255(src_g[x] >> shift);
dst_argb[2] = clamp255(src_r[x] >> shift);
dst_argb[3] = clamp255(src_a[x] >> shift);
dst_argb += 4;
}
}
void MergeXR64Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width) {
assert(depth >= 1);
assert(depth <= 16);
int x;
int shift = 16 - depth;
int max = (1 << depth) - 1;
for (x = 0; x < width; ++x) {
dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
dst_ar64[3] = 0xffff;
dst_ar64 += 4;
}
}
void MergeXRGB16To8Row_C(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width) {
assert(depth >= 8);
assert(depth <= 16);
int x;
int shift = depth - 8;
for (x = 0; x < width; ++x) {
dst_argb[0] = clamp255(src_b[x] >> shift);
dst_argb[1] = clamp255(src_g[x] >> shift);
dst_argb[2] = clamp255(src_r[x] >> shift);
dst_argb[3] = 0xff;
dst_argb += 4;
}
}
void SplitXRGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,

View File

@ -5262,7 +5262,9 @@ void MergeARGBRow_SSE2(const uint8_t* src_r,
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
#ifdef HAS_MERGEXRGBROW_SSE2
void MergeXRGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
@ -5346,7 +5348,9 @@ void MergeARGBRow_AVX2(const uint8_t* src_r,
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
#ifdef HAS_MERGEXRGBROW_AVX2
void MergeXRGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
@ -5440,7 +5444,9 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb,
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
#ifdef HAS_SPLITXRGBROW_SSE2
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -5536,7 +5542,9 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
: "m"(kShuffleMaskARGBSplit) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif
#ifdef HAS_SPLITXRGBROW_SSSE3
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -5628,7 +5636,9 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
"m"(kShuffleMaskARGBPermute) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SPLITXRGBROW_AVX2
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -5670,7 +5680,330 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit), // %5
"m"(kShuffleMaskARGBPermute) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_MERGEXR30ROW_AVX2
void MergeXR30Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width) {
int shift = depth - 10;
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants
"vpsrlw $14,%%ymm5,%%ymm5 \n"
"vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $6,%%ymm6,%%ymm6 \n"
"vmovd %5,%%xmm4 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu (%0,%1),%%ymm1 \n"
"vmovdqu (%0,%2),%%ymm2 \n"
"vpsrlw %%xmm4,%%ymm0,%%ymm0 \n"
"vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
"vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
"vpminuw %%ymm0,%%ymm6,%%ymm0 \n"
"vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
"vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
"vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit
"vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB
"vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
"vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG
"vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
"vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit
"vpslld $0xa,%%ymm2,%%ymm2 \n"
"vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine
"vpor %%ymm2,%%ymm3,%%ymm3 \n"
"vmovdqu %%ymm0,(%3) \n"
"vmovdqu %%ymm3,0x20(%3) \n"
"lea 0x20(%0),%0 \n"
"lea 0x40(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
#if defined(__i386__)
: "m"(shift) // %5
#else
: "rm"(shift) // %5
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
#ifdef HAS_MERGEAR64ROW_AVX2
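// vpunpckl/hwd and vpunpckl/hdq interleave within each 128-bit lane, so the
// dwords are pre-shuffled with vpermd (using this table) to make the stored
// output come out in pixel order across the full 256-bit register.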
static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
void MergeAR64Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
mask = (mask << 16) + mask;
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"sub %0,%3 \n"
"vmovdqa %8,%%ymm5 \n"
"vmovd %6,%%xmm6 \n"
"vbroadcastss %7,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // R
"vmovdqu (%0,%1),%%ymm1 \n" // G
"vmovdqu (%0,%2),%%ymm2 \n" // B
"vmovdqu (%0,%3),%%ymm3 \n" // A
"vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
"vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
"vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
"vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
"vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
"vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
"vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
"vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
"vpermd %%ymm0,%%ymm5,%%ymm0 \n"
"vpermd %%ymm1,%%ymm5,%%ymm1 \n"
"vpermd %%ymm2,%%ymm5,%%ymm2 \n"
"vpermd %%ymm3,%%ymm5,%%ymm3 \n"
"vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
"vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
"vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
"vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
"vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
"vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
"vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
"vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
"vmovdqu %%ymm3,(%4) \n"
"vmovdqu %%ymm2,0x20(%4) \n"
"vmovdqu %%ymm4,0x40(%4) \n"
"vmovdqu %%ymm1,0x60(%4) \n"
"lea 0x20(%0),%0 \n"
"lea 0x80(%4),%4 \n"
"subl $0x10,%5 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_ar64), // %4
#if defined(__i386__)
"+m"(width) // %5
: "m"(shift), // %6
"m"(mask), // %7
#else
"+rm"(width) // %5
: "rm"(shift), // %6
"rm"(mask), // %7
#endif
"m"(MergeAR64Permute) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_MERGEXR64ROW_AVX2
void MergeXR64Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
mask = (mask << 16) + mask;
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"vmovdqa %7,%%ymm5 \n"
"vmovd %5,%%xmm6 \n"
"vbroadcastss %6,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // R
"vmovdqu (%0,%1),%%ymm1 \n" // G
"vmovdqu (%0,%2),%%ymm2 \n" // B
"vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
"vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
"vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
"vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
"vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
"vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
"vpermd %%ymm0,%%ymm5,%%ymm0 \n"
"vpermd %%ymm1,%%ymm5,%%ymm1 \n"
"vpermd %%ymm2,%%ymm5,%%ymm2 \n"
"vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff)
"vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low)
"vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi)
"vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low)
"vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi)
"vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1)
"vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3)
"vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2)
"vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4)
"vmovdqu %%ymm3,(%3) \n"
"vmovdqu %%ymm2,0x20(%3) \n"
"vmovdqu %%ymm4,0x40(%3) \n"
"vmovdqu %%ymm1,0x60(%3) \n"
"lea 0x20(%0),%0 \n"
"lea 0x80(%3),%3 \n"
"subl $0x10,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar64), // %3
"+r"(width) // %4
#if defined(__i386__)
: "m"(shift), // %5
"m"(mask), // %6
#else
: "rm"(shift), // %5
"rm"(mask), // %6
#endif
"m"(MergeAR64Permute) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_MERGEARGB16TO8ROW_AVX2
static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11,
4, 12, 5, 13, 6, 14, 7, 15};
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width) {
int shift = depth - 8;
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"sub %0,%3 \n"
"vbroadcastf128 %7,%%ymm5 \n"
"vmovd %6,%%xmm6 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // R
"vmovdqu (%0,%1),%%ymm1 \n" // G
"vmovdqu (%0,%2),%%ymm2 \n" // B
"vmovdqu (%0,%3),%%ymm3 \n" // A
"vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
"vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
"vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
"vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
"vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
"vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
"vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
"vmovdqu %%ymm2,(%4) \n"
"vmovdqu %%ymm0,0x20(%4) \n"
"lea 0x20(%0),%0 \n"
"lea 0x40(%4),%4 \n"
"subl $0x10,%5 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
#if defined(__i386__)
"+m"(width) // %5
: "m"(shift), // %6
#else
"+rm"(width) // %5
: "rm"(shift), // %6
#endif
"m"(MergeARGB16To8Shuffle) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width) {
int shift = depth - 8;
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"vbroadcastf128 %6,%%ymm5 \n"
"vmovd %5,%%xmm6 \n"
"vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
"vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff)
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // R
"vmovdqu (%0,%1),%%ymm1 \n" // G
"vmovdqu (%0,%2),%%ymm2 \n" // B
"vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
"vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
"vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar)
"vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar)
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave)
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave)
"vpermq $0xd8,%%ymm1,%%ymm1 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low)
"vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi)
"vmovdqu %%ymm2,(%3) \n"
"vmovdqu %%ymm0,0x20(%3) \n"
"lea 0x20(%0),%0 \n"
"lea 0x40(%3),%3 \n"
"subl $0x10,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
#if defined(__i386__)
: "m"(shift), // %5
#else
: "rm"(shift), // %5
#endif
"m"(MergeARGB16To8Shuffle) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

View File

@ -415,11 +415,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@ -438,11 +438,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@ -537,11 +537,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@ -558,11 +558,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@ -760,8 +760,8 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q0}, [%2]! \n" // load B
"subs %4, %4, #16 \n" // 16 processed per loop
"vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
"vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
"vst4.8 {d0, d2, d4, d6}, [%3]! \n" // store 8 ARGB
"vst4.8 {d1, d3, d5, d7}, [%3]! \n" // next 8 ARGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
@ -773,6 +773,226 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
);
}
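// Note: vshl with a negative per-lane shift count shifts right, so
// shift = 10 - depth handles depths above 10 without a separate path.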
void MergeXR30Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width) {
int shift = 10 - depth;
asm volatile(
"vmov.u32 q14, #1023 \n"
"vdup.32 q15, %5 \n"
"1: \n"
"vld1.16 {d4}, [%2]! \n" // B
"vld1.16 {d2}, [%1]! \n" // G
"vld1.16 {d0}, [%0]! \n" // R
"vmovl.u16 q2, d4 \n" // B
"vmovl.u16 q1, d2 \n" // G
"vmovl.u16 q0, d0 \n" // R
"vshl.u32 q2, q2, q15 \n" // 000B
"vshl.u32 q1, q1, q15 \n"
"vshl.u32 q0, q0, q15 \n"
"vmin.u32 q2, q2, q14 \n"
"vmin.u32 q1, q1, q14 \n"
"vmin.u32 q0, q0, q14 \n"
"vsli.u32 q2, q1, #10 \n" // 00GB
"vsli.u32 q2, q0, #20 \n" // 0RGB
"vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
"subs %4, %4, #4 \n"
"vst1.8 {q2}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
: "r"(shift) // %5
: "memory", "cc", "q0", "q1", "q2", "q14", "q15");
}
void MergeXR30Row_10_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int /* depth */,
int width) {
asm volatile(
"vmov.u32 q14, #1023 \n"
"1: \n"
"vld1.16 {d4}, [%2]! \n" // B
"vld1.16 {d2}, [%1]! \n" // G
"vld1.16 {d0}, [%0]! \n" // R
"vmovl.u16 q2, d4 \n" // 000B
"vmovl.u16 q1, d2 \n" // G
"vmovl.u16 q0, d0 \n" // R
"vmin.u32 q2, q2, q14 \n"
"vmin.u32 q1, q1, q14 \n"
"vmin.u32 q0, q0, q14 \n"
"vsli.u32 q2, q1, #10 \n" // 00GB
"vsli.u32 q2, q0, #20 \n" // 0RGB
"vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
"subs %4, %4, #4 \n"
"vst1.8 {q2}, [%3]! \n"
"bgt 1b \n"
"3: \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
:
: "memory", "cc", "q0", "q1", "q2", "q14");
}
void MergeAR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile(
"vdup.u16 q15, %6 \n"
"vdup.u16 q14, %7 \n"
"1: \n"
"vld1.16 {q2}, [%0]! \n" // R
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"vld1.16 {q3}, [%3]! \n" // A
"vmin.u16 q2, q2, q14 \n"
"vmin.u16 q1, q1, q14 \n"
"vmin.u16 q0, q0, q14 \n"
"vmin.u16 q3, q3, q14 \n"
"vshl.u16 q2, q2, q15 \n"
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
"vshl.u16 q3, q3, q15 \n"
"subs %5, %5, #8 \n"
"vst4.16 {d0, d2, d4, d6}, [%4]! \n"
"vst4.16 {d1, d3, d5, d7}, [%4]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_ar64), // %4
"+r"(width) // %5
: "r"(shift), // %6
"r"(mask) // %7
: "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}
void MergeXR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile(
"vmov.u8 q3, #0xff \n" // A (0xffff)
"vdup.u16 q15, %5 \n"
"vdup.u16 q14, %6 \n"
"1: \n"
"vld1.16 {q2}, [%0]! \n" // R
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"vmin.u16 q2, q2, q14 \n"
"vmin.u16 q1, q1, q14 \n"
"vmin.u16 q0, q0, q14 \n"
"vshl.u16 q2, q2, q15 \n"
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
"subs %4, %4, #8 \n"
"vst4.16 {d0, d2, d4, d6}, [%3]! \n"
"vst4.16 {d1, d3, d5, d7}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar64), // %3
"+r"(width) // %4
: "r"(shift), // %5
"r"(mask) // %6
: "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}
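// Samples are shifted by 8 - depth (a right shift for depth > 8) and then
// narrowed with vqmovn, whose unsigned saturation provides the clamp to 255.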
void MergeARGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width) {
int shift = 8 - depth;
asm volatile(
"vdup.16 q15, %6 \n"
"1: \n"
"vld1.16 {q2}, [%0]! \n" // R
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"vld1.16 {q3}, [%3]! \n" // A
"vshl.u16 q2, q2, q15 \n"
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
"vshl.u16 q3, q3, q15 \n"
"vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d1, q1 \n"
"vqmovn.u16 d2, q2 \n"
"vqmovn.u16 d3, q3 \n"
"subs %5, %5, #8 \n"
"vst4.8 {d0, d1, d2, d3}, [%4]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: "r"(shift) // %6
: "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width) {
int shift = 8 - depth;
asm volatile(
"vdup.16 q15, %5 \n"
"vmov.u8 d6, #0xff \n" // A (0xff)
"1: \n"
"vld1.16 {q2}, [%0]! \n" // R
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"vshl.u16 q2, q2, q15 \n"
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
"vqmovn.u16 d5, q2 \n"
"vqmovn.u16 d4, q1 \n"
"vqmovn.u16 d3, q0 \n"
"subs %4, %4, #8 \n"
"vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: "r"(shift) // %5
: "memory", "cc", "q0", "q1", "q2", "d6", "q15");
}
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(

View File

@ -874,6 +874,240 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
);
}
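// The AArch64 kernels below use the same negative-count ushl idiom as the
// 32-bit NEON code above: 10 - depth (or 8 - depth) becomes a right shift
// when negative.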
void MergeXR30Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int depth,
int width) {
int shift = 10 - depth;
asm volatile(
"movi v30.16b, #255 \n"
"ushr v30.4s, v30.4s, #22 \n" // 1023
"dup v31.4s, %w5 \n"
"1: \n"
"ldr d2, [%2], #8 \n" // B
"ldr d1, [%1], #8 \n" // G
"ldr d0, [%0], #8 \n" // R
"ushll v2.4s, v2.4h, #0 \n" // B
"ushll v1.4s, v1.4h, #0 \n" // G
"ushll v0.4s, v0.4h, #0 \n" // R
"ushl v2.4s, v2.4s, v31.4s \n" // 000B
"ushl v1.4s, v1.4s, v31.4s \n" // G
"ushl v0.4s, v0.4s, v31.4s \n" // R
"umin v2.4s, v2.4s, v30.4s \n"
"umin v1.4s, v1.4s, v30.4s \n"
"umin v0.4s, v0.4s, v30.4s \n"
"sli v2.4s, v1.4s, #10 \n" // 00GB
"sli v2.4s, v0.4s, #20 \n" // 0RGB
"orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
"subs %w4, %w4, #4 \n"
"str q2, [%3], #16 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
: "r"(shift) // %5
: "memory", "cc", "v0", "v1", "v2", "v30", "v31");
}
void MergeXR30Row_10_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
int /* depth */,
int width) {
asm volatile(
"movi v30.16b, #255 \n"
"ushr v30.4s, v30.4s, #22 \n" // 1023
"1: \n"
"ldr d2, [%2], #8 \n" // B
"ldr d1, [%1], #8 \n" // G
"ldr d0, [%0], #8 \n" // R
"ushll v2.4s, v2.4h, #0 \n" // 000B
"ushll v1.4s, v1.4h, #0 \n" // G
"ushll v0.4s, v0.4h, #0 \n" // R
"umin v2.4s, v2.4s, v30.4s \n"
"umin v1.4s, v1.4s, v30.4s \n"
"umin v0.4s, v0.4s, v30.4s \n"
"sli v2.4s, v1.4s, #10 \n" // 00GB
"sli v2.4s, v0.4s, #20 \n" // 0RGB
"orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
"subs %w4, %w4, #4 \n"
"str q2, [%3], #16 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
:
: "memory", "cc", "v0", "v1", "v2", "v30");
}
void MergeAR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile(
"dup v30.8h, %w7 \n"
"dup v31.8h, %w6 \n"
"1: \n"
"ldr q2, [%0], #16 \n" // R
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"ldr q3, [%3], #16 \n" // A
"umin v2.8h, v2.8h, v30.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"umin v1.8h, v1.8h, v30.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"umin v0.8h, v0.8h, v30.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"umin v3.8h, v3.8h, v30.8h \n"
"prfm pldl1keep, [%3, 448] \n"
"ushl v2.8h, v2.8h, v31.8h \n"
"ushl v1.8h, v1.8h, v31.8h \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"ushl v3.8h, v3.8h, v31.8h \n"
"subs %w5, %w5, #8 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_ar64), // %4
"+r"(width) // %5
: "r"(shift), // %6
"r"(mask) // %7
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
void MergeXR64Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
int depth,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile(
"movi v3.16b, #0xff \n" // A (0xffff)
"dup v30.8h, %w6 \n"
"dup v31.8h, %w5 \n"
"1: \n"
"ldr q2, [%0], #16 \n" // R
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"umin v2.8h, v2.8h, v30.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"umin v1.8h, v1.8h, v30.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"umin v0.8h, v0.8h, v30.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"ushl v2.8h, v2.8h, v31.8h \n"
"ushl v1.8h, v1.8h, v31.8h \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"subs %w4, %w4, #8 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_ar64), // %3
"+r"(width) // %4
: "r"(shift), // %5
"r"(mask) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
void MergeARGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
int depth,
int width) {
int shift = 8 - depth;
asm volatile(
"dup v31.8h, %w6 \n"
"1: \n"
"ldr q2, [%0], #16 \n" // R
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"ldr q3, [%3], #16 \n" // A
"ushl v2.8h, v2.8h, v31.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"ushl v1.8h, v1.8h, v31.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"ushl v3.8h, v3.8h, v31.8h \n"
"prfm pldl1keep, [%3, 448] \n"
"uqxtn v2.8b, v2.8h \n"
"uqxtn v1.8b, v1.8h \n"
"uqxtn v0.8b, v0.8h \n"
"uqxtn v3.8b, v3.8h \n"
"subs %w5, %w5, #8 \n"
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: "r"(shift) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
int depth,
int width) {
int shift = 8 - depth;
asm volatile(
"dup v31.8h, %w5 \n"
"movi v3.8b, #0xff \n" // A (0xff)
"1: \n"
"ldr q2, [%0], #16 \n" // R
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"ushl v2.8h, v2.8h, v31.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"ushl v1.8h, v1.8h, v31.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"uqxtn v2.8b, v2.8h \n"
"uqxtn v1.8b, v1.8h \n"
"uqxtn v0.8b, v0.8h \n"
"subs %w4, %w4, #8 \n"
"st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: "r"(shift) // %5
: "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}
// Copy multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(

View File

@ -3091,6 +3091,164 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
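// The macros below instantiate C-vs-SIMD parity tests for the merge
// functions at several depths: random 16-bit planes are packed once with
// CPU features masked off (C reference) and then with the optimized paths,
// and the outputs must match byte for byte. _Any uses an odd width,
// _Unaligned offsets the source planes by one byte, _Invert passes a
// negative height and _Opt is the aligned fast path.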
#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \
align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF); \
DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
for (int i = 0; i < kPixels; ++i) { \
src_pixels_r[i] = fastrand() & 65535; \
src_pixels_g[i] = fastrand() & 65535; \
src_pixels_b[i] = fastrand() & 65535; \
src_pixels_a[i] = fastrand() & 65535; \
} \
memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
MaskCpuFlags(disable_cpu_flags_); \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \
kWidth, NEG benchmark_height_, DEPTH); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \
kWidth, NEG benchmark_height_, DEPTH); \
} \
for (int i = 0; i < kPixels * 4; ++i) { \
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
} \
free_aligned_buffer_page_end(src_memory_r); \
free_aligned_buffer_page_end(src_memory_g); \
free_aligned_buffer_page_end(src_memory_b); \
free_aligned_buffer_page_end(src_memory_a); \
free_aligned_buffer_page_end(dst_memory_c); \
free_aligned_buffer_page_end(dst_memory_opt); \
}
#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \
align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
for (int i = 0; i < kPixels; ++i) { \
src_pixels_r[i] = fastrand() & 65535; \
src_pixels_g[i] = fastrand() & 65535; \
src_pixels_b[i] = fastrand() & 65535; \
} \
memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
MaskCpuFlags(disable_cpu_flags_); \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \
NEG benchmark_height_, DEPTH); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth, \
NEG benchmark_height_, DEPTH); \
} \
for (int i = 0; i < kPixels * 4; ++i) { \
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
} \
free_aligned_buffer_page_end(src_memory_r); \
free_aligned_buffer_page_end(src_memory_g); \
free_aligned_buffer_page_end(src_memory_b); \
free_aligned_buffer_page_end(dst_memory_c); \
free_aligned_buffer_page_end(dst_memory_opt); \
}
#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
1) \
TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) \
TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, \
0) \
TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
1) \
TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)
TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10)
TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12)
TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16)
TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10)
TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12)
TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
#define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \
align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
for (int i = 0; i < kPixels; ++i) { \
src_pixels_r[i] = fastrand() & 65535; \
src_pixels_g[i] = fastrand() & 65535; \
src_pixels_b[i] = fastrand() & 65535; \
} \
memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
MaskCpuFlags(disable_cpu_flags_); \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, dst_pixels_c, kWidth * 4, kWidth, \
NEG benchmark_height_, DEPTH); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, dst_pixels_opt, kWidth * 4, kWidth, \
NEG benchmark_height_, DEPTH); \
} \
for (int i = 0; i < kPixels * 4; ++i) { \
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \
} \
free_aligned_buffer_page_end(src_memory_r); \
free_aligned_buffer_page_end(src_memory_g); \
free_aligned_buffer_page_end(src_memory_b); \
free_aligned_buffer_page_end(dst_memory_c); \
free_aligned_buffer_page_end(dst_memory_opt); \
}
#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
1) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10)
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12)
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {