Add MergeAR30Plane, MergeAR64Plane, MergeARGB16To8Plane
These functions merge high bit depth planar RGB pixels into packed format.

Change-Id: I506935a164b069e6b2fed8bf152cb874310c0916
Bug: libyuv:886, libyuv:889
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2780468
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 2525698acb
commit 8a13626e42
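The call pattern for the new planar merge functions, sketched below. The buffer setup and the PackPlanesToAR30 helper are illustrative, not part of this change; the 16-bit plane strides are passed in uint16_t elements and the packed destination stride in bytes, as the coalescing checks in the implementation suggest.

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/planar_functions.h"

// Illustrative helper: pack three tightly packed 10-bit planes into AR30.
int PackPlanesToAR30(const uint16_t* r, const uint16_t* g, const uint16_t* b,
                     int width, int height) {
  uint8_t* ar30 = (uint8_t*)malloc((size_t)width * height * 4);
  if (!ar30) {
    return -1;
  }
  MergeXR30Plane(r, width,  // 16-bit plane strides in uint16_t units
                 g, width,
                 b, width,
                 ar30, width * 4,  // packed AR30 stride in bytes
                 width, height, /*depth=*/10);
  free(ar30);
  return 0;
}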
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1783
+Version: 1784
License: BSD
License File: LICENSE
@@ -229,6 +229,60 @@ void MergeARGBPlane(const uint8_t* src_r,
                    int width,
                    int height);

// Merge separate 'depth' bit R, G and B planes stored in lsb
// into one interleaved XR30 plane.
// depth should be in range [10, 16]
LIBYUV_API
void MergeXR30Plane(const uint16_t* src_r,
                    int src_stride_r,
                    const uint16_t* src_g,
                    int src_stride_g,
                    const uint16_t* src_b,
                    int src_stride_b,
                    uint8_t* dst_ar30,
                    int dst_stride_ar30,
                    int width,
                    int height,
                    int depth);

// Merge separate 'depth' bit R, G, B and A planes stored in lsb
// into one interleaved AR64 plane.
// src_a can be NULL, in which case alpha is filled with the opaque value.
// depth should be in range [1, 16]
LIBYUV_API
void MergeAR64Plane(const uint16_t* src_r,
                    int src_stride_r,
                    const uint16_t* src_g,
                    int src_stride_g,
                    const uint16_t* src_b,
                    int src_stride_b,
                    const uint16_t* src_a,
                    int src_stride_a,
                    uint16_t* dst_ar64,
                    int dst_stride_ar64,
                    int width,
                    int height,
                    int depth);

// Merge separate 'depth' bit R, G, B and A planes stored in lsb
// into one interleaved ARGB plane.
// src_a can be NULL, in which case alpha is filled with the opaque value.
// depth should be in range [8, 16]
LIBYUV_API
void MergeARGB16To8Plane(const uint16_t* src_r,
                         int src_stride_r,
                         const uint16_t* src_g,
                         int src_stride_g,
                         const uint16_t* src_b,
                         int src_stride_b,
                         const uint16_t* src_a,
                         int src_stride_a,
                         uint8_t* dst_argb,
                         int dst_stride_argb,
                         int width,
                         int height,
                         int depth);

// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,
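A sketch of the NULL-alpha convention the comments above describe. The Rgb12ToArgb wrapper and its 12-bit tightly packed input are illustrative assumptions.

#include <stddef.h>
#include <stdint.h>
#include "libyuv/planar_functions.h"

// Convert tightly packed 12-bit planar RGB to 8-bit packed ARGB.
// Passing src_a == NULL makes the library fill alpha with the opaque value.
void Rgb12ToArgb(const uint16_t* r, const uint16_t* g, const uint16_t* b,
                 uint8_t* argb, int width, int height) {
  MergeARGB16To8Plane(r, width, g, width, b, width,
                      NULL, 0,          // no alpha plane
                      argb, width * 4,  // ARGB stride in bytes
                      width, height, /*depth=*/12);
}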
@@ -289,6 +289,7 @@ extern "C" {
#define HAS_I410TOAR30ROW_SSSE3
#define HAS_I410TOARGBROW_SSSE3
#define HAS_MERGEARGBROW_SSE2
#define HAS_MERGEXRGBROW_SSE2
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_SSSE3
#define HAS_P210TOAR30ROW_SSSE3
@@ -300,6 +301,8 @@ extern "C" {
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITARGBROW_SSE2
#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITXRGBROW_SSE2
#define HAS_SPLITXRGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3

@@ -330,7 +333,13 @@ extern "C" {
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_DIVIDEROW_16_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_MERGEAR64ROW_AVX2
#define HAS_MERGEARGB16TO8ROW_AVX2
#define HAS_MERGEARGBROW_AVX2
#define HAS_MERGEXR30ROW_AVX2
#define HAS_MERGEXR64ROW_AVX2
#define HAS_MERGEXRGB16TO8ROW_AVX2
#define HAS_MERGEXRGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I212TOAR30ROW_AVX2
@@ -350,6 +359,7 @@ extern "C" {
#define HAS_MULTIPLYROW_16_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SPLITXRGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24
@@ -423,7 +433,13 @@ extern "C" {
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I444TOARGBROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEAR64ROW_NEON
#define HAS_MERGEARGB16TO8ROW_NEON
#define HAS_MERGEARGBROW_NEON
#define HAS_MERGEXR30ROW_NEON
#define HAS_MERGEXR64ROW_NEON
#define HAS_MERGEXRGB16TO8ROW_NEON
#define HAS_MERGEXRGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MERGEUVROW_16_NEON
#define HAS_MIRRORROW_NEON
@@ -454,6 +470,7 @@ extern "C" {
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITARGBROW_NEON
#define HAS_SPLITXRGBROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_SPLITUVROW_16_NEON
@@ -676,6 +693,7 @@ extern "C" {
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
#define LIBYUV_NOINLINE __declspec(noinline)
typedef __declspec(align(16)) int16_t vec16[8];
typedef __declspec(align(16)) int32_t vec32[4];
typedef __declspec(align(16)) float vecf32[4];
@@ -696,6 +714,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
#define LIBYUV_NOINLINE __attribute__((noinline))
typedef int16_t __attribute__((vector_size(16))) vec16;
typedef int32_t __attribute__((vector_size(16))) vec32;
typedef float __attribute__((vector_size(16))) vecf32;
@@ -711,6 +730,7 @@ typedef uint32_t __attribute__((vector_size(32))) ulvec32;
typedef uint8_t __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
#define LIBYUV_NOINLINE
typedef int16_t vec16[8];
typedef int32_t vec32[4];
typedef float vecf32[4];
@@ -2061,6 +2081,179 @@ void SplitXRGBRow_Any_NEON(const uint8_t* src_argb,
                           uint8_t* dst_b,
                           int width);

void MergeXR30Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint8_t* dst_ar30,
                    int depth,
                    int width);
void MergeAR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    const uint16_t* src_a,
                    uint16_t* dst_ar64,
                    int depth,
                    int width);
void MergeARGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         const uint16_t* src_a,
                         uint8_t* dst_argb,
                         int depth,
                         int width);
void MergeXR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint16_t* dst_ar64,
                    int depth,
                    int width);
void MergeXRGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         uint8_t* dst_argb,
                         int depth,
                         int width);
void MergeXR30Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width);
void MergeAR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width);
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width);
void MergeXR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width);
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width);
void MergeXR30Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width);
void MergeXR30Row_10_NEON(const uint16_t* src_r,
                          const uint16_t* src_g,
                          const uint16_t* src_b,
                          uint8_t* dst_ar30,
                          int /* depth */,
                          int width);
void MergeAR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width);
void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width);
void MergeXR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width);
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width);
void MergeXR30Row_Any_AVX2(const uint16_t* src_r,
                           const uint16_t* src_g,
                           const uint16_t* src_b,
                           uint8_t* dst_ar30,
                           int depth,
                           int width);
void MergeAR64Row_Any_AVX2(const uint16_t* src_r,
                           const uint16_t* src_g,
                           const uint16_t* src_b,
                           const uint16_t* src_a,
                           uint16_t* dst_ar64,
                           int depth,
                           int width);
void MergeXR64Row_Any_AVX2(const uint16_t* src_r,
                           const uint16_t* src_g,
                           const uint16_t* src_b,
                           uint16_t* dst_ar64,
                           int depth,
                           int width);
void MergeARGB16To8Row_Any_AVX2(const uint16_t* src_r,
                                const uint16_t* src_g,
                                const uint16_t* src_b,
                                const uint16_t* src_a,
                                uint8_t* dst_argb,
                                int depth,
                                int width);
void MergeXRGB16To8Row_Any_AVX2(const uint16_t* src_r,
                                const uint16_t* src_g,
                                const uint16_t* src_b,
                                uint8_t* dst_argb,
                                int depth,
                                int width);
void MergeXR30Row_Any_NEON(const uint16_t* src_r,
                           const uint16_t* src_g,
                           const uint16_t* src_b,
                           uint8_t* dst_ar30,
                           int depth,
                           int width);
void MergeXR30Row_10_Any_NEON(const uint16_t* src_r,
                              const uint16_t* src_g,
                              const uint16_t* src_b,
                              uint8_t* dst_ar30,
                              int depth,
                              int width);
void MergeAR64Row_Any_NEON(const uint16_t* src_r,
                           const uint16_t* src_g,
                           const uint16_t* src_b,
                           const uint16_t* src_a,
                           uint16_t* dst_ar64,
                           int depth,
                           int width);
void MergeARGB16To8Row_Any_NEON(const uint16_t* src_r,
                                const uint16_t* src_g,
                                const uint16_t* src_b,
                                const uint16_t* src_a,
                                uint8_t* dst_argb,
                                int depth,
                                int width);
void MergeXR64Row_Any_NEON(const uint16_t* src_r,
                           const uint16_t* src_g,
                           const uint16_t* src_b,
                           uint16_t* dst_ar64,
                           int depth,
                           int width);
void MergeXRGB16To8Row_Any_NEON(const uint16_t* src_r,
                                const uint16_t* src_g,
                                const uint16_t* src_b,
                                uint8_t* dst_argb,
                                int depth,
                                int width);

void MergeUVRow_16_C(const uint16_t* src_u,
                     const uint16_t* src_v,
                     uint16_t* dst_uv,
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1783
+#define LIBYUV_VERSION 1784

#endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -1026,7 +1026,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
    dst_stride_a = 0;
  }

-#if defined(HAS_SPLITARGBROW_SSE2)
+#if defined(HAS_SPLITXRGBROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    SplitXRGBRow = SplitXRGBRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
@@ -1034,7 +1034,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
    }
  }
#endif
-#if defined(HAS_SPLITARGBROW_SSSE3)
+#if defined(HAS_SPLITXRGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    SplitXRGBRow = SplitXRGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
@@ -1042,7 +1042,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
    }
  }
#endif
-#if defined(HAS_SPLITARGBROW_AVX2)
+#if defined(HAS_SPLITXRGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    SplitXRGBRow = SplitXRGBRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
@@ -1050,7 +1050,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
    }
  }
#endif
-#if defined(HAS_SPLITRGBROW_NEON)
+#if defined(HAS_SPLITXRGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SplitXRGBRow = SplitXRGBRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -1112,7 +1112,7 @@ void SplitARGBPlane(const uint8_t* src_argb,
    }
  }
#endif
-#if defined(HAS_SPLITRGBROW_NEON)
+#if defined(HAS_SPLITARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SplitARGBRow = SplitARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
@@ -1153,13 +1153,13 @@ void MergeARGBPlane(const uint8_t* src_r,
                      const uint8_t* src_b, uint8_t* dst_argb, int width) =
      MergeXRGBRow_C;

+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
  if (src_a == NULL) {
-    // Negative height means invert the image.
-    if (height < 0) {
-      height = -height;
-      dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-      dst_stride_argb = -dst_stride_argb;
-    }
    // Coalesce rows.
    if (src_stride_r == width && src_stride_g == width &&
        src_stride_b == width && dst_stride_argb == width * 4) {
@@ -1167,7 +1167,7 @@ void MergeARGBPlane(const uint8_t* src_r,
      height = 1;
      src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
    }
-#if defined(HAS_MERGEARGBROW_SSE2)
+#if defined(HAS_MERGEXRGBROW_SSE2)
    if (TestCpuFlag(kCpuHasSSE2)) {
      MergeXRGBRow = MergeXRGBRow_Any_SSE2;
      if (IS_ALIGNED(width, 8)) {
@@ -1175,7 +1175,7 @@ void MergeARGBPlane(const uint8_t* src_r,
      }
    }
#endif
-#if defined(HAS_MERGEARGBROW_AVX2)
+#if defined(HAS_MERGEXRGBROW_AVX2)
    if (TestCpuFlag(kCpuHasAVX2)) {
      MergeXRGBRow = MergeXRGBRow_Any_AVX2;
      if (IS_ALIGNED(width, 16)) {
@@ -1183,7 +1183,7 @@ void MergeARGBPlane(const uint8_t* src_r,
      }
    }
#endif
-#if defined(HAS_MERGERGBROW_NEON)
+#if defined(HAS_MERGEXRGBROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeXRGBRow = MergeXRGBRow_Any_NEON;
      if (IS_ALIGNED(width, 16)) {
@@ -1200,12 +1200,6 @@ void MergeARGBPlane(const uint8_t* src_r,
      dst_argb += dst_stride_argb;
    }
  } else {
-    if (height < 0) {
-      height = -height;
-      dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-      dst_stride_argb = -dst_stride_argb;
-    }

    if (src_stride_r == width && src_stride_g == width &&
        src_stride_b == width && src_stride_a == width &&
        dst_stride_argb == width * 4) {
@@ -1230,7 +1224,7 @@ void MergeARGBPlane(const uint8_t* src_r,
      }
    }
#endif
-#if defined(HAS_MERGERGBROW_NEON)
+#if defined(HAS_MERGEARGBROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeARGBRow = MergeARGBRow_Any_NEON;
      if (IS_ALIGNED(width, 16)) {
@@ -1249,6 +1243,263 @@ void MergeARGBPlane(const uint8_t* src_r,
    }
  }
}

LIBYUV_API
void MergeXR30Plane(const uint16_t* src_r,
                    int src_stride_r,
                    const uint16_t* src_g,
                    int src_stride_g,
                    const uint16_t* src_b,
                    int src_stride_b,
                    uint8_t* dst_ar30,
                    int dst_stride_ar30,
                    int width,
                    int height,
                    int depth) {
  int y;
  void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g,
                       const uint16_t* src_b, uint8_t* dst_ar30, int depth,
                       int width) = MergeXR30Row_C;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
    dst_stride_ar30 = -dst_stride_ar30;
  }
  // Coalesce rows.
  if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
      dst_stride_ar30 == width * 4) {
    width *= height;
    height = 1;
    src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0;
  }
#if defined(HAS_MERGEXR30ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    MergeXR30Row = MergeXR30Row_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
      MergeXR30Row = MergeXR30Row_AVX2;
    }
  }
#endif
#if defined(HAS_MERGEXR30ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    if (depth == 10) {
      MergeXR30Row = MergeXR30Row_10_Any_NEON;
      if (IS_ALIGNED(width, 8)) {
        MergeXR30Row = MergeXR30Row_10_NEON;
      }
    } else {
      MergeXR30Row = MergeXR30Row_Any_NEON;
      if (IS_ALIGNED(width, 8)) {
        MergeXR30Row = MergeXR30Row_NEON;
      }
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width);
    src_r += src_stride_r;
    src_g += src_stride_g;
    src_b += src_stride_b;
    dst_ar30 += dst_stride_ar30;
  }
}

LIBYUV_API
void MergeAR64Plane(const uint16_t* src_r,
                    int src_stride_r,
                    const uint16_t* src_g,
                    int src_stride_g,
                    const uint16_t* src_b,
                    int src_stride_b,
                    const uint16_t* src_a,
                    int src_stride_a,
                    uint16_t* dst_ar64,
                    int dst_stride_ar64,
                    int width,
                    int height,
                    int depth) {
  int y;
  void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g,
                       const uint16_t* src_b, const uint16_t* src_a,
                       uint16_t* dst_argb, int depth, int width) =
      MergeAR64Row_C;
  void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g,
                       const uint16_t* src_b, uint16_t* dst_argb, int depth,
                       int width) = MergeXR64Row_C;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64;
    dst_stride_ar64 = -dst_stride_ar64;
  }
  if (src_a == NULL) {
    // Coalesce rows.
    if (src_stride_r == width && src_stride_g == width &&
        src_stride_b == width && dst_stride_ar64 == width * 4) {
      width *= height;
      height = 1;
      src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0;
    }
#if defined(HAS_MERGEXR64ROW_AVX2)
    if (TestCpuFlag(kCpuHasAVX2)) {
      MergeXR64Row = MergeXR64Row_Any_AVX2;
      if (IS_ALIGNED(width, 16)) {
        MergeXR64Row = MergeXR64Row_AVX2;
      }
    }
#endif
#if defined(HAS_MERGEXR64ROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeXR64Row = MergeXR64Row_Any_NEON;
      if (IS_ALIGNED(width, 8)) {
        MergeXR64Row = MergeXR64Row_NEON;
      }
    }
#endif

    for (y = 0; y < height; ++y) {
      MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width);
      src_r += src_stride_r;
      src_g += src_stride_g;
      src_b += src_stride_b;
      dst_ar64 += dst_stride_ar64;
    }
  } else {
    if (src_stride_r == width && src_stride_g == width &&
        src_stride_b == width && src_stride_a == width &&
        dst_stride_ar64 == width * 4) {
      width *= height;
      height = 1;
      src_stride_r = src_stride_g = src_stride_b = src_stride_a =
          dst_stride_ar64 = 0;
    }
#if defined(HAS_MERGEAR64ROW_AVX2)
    if (TestCpuFlag(kCpuHasAVX2)) {
      MergeAR64Row = MergeAR64Row_Any_AVX2;
      if (IS_ALIGNED(width, 16)) {
        MergeAR64Row = MergeAR64Row_AVX2;
      }
    }
#endif
#if defined(HAS_MERGEAR64ROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeAR64Row = MergeAR64Row_Any_NEON;
      if (IS_ALIGNED(width, 8)) {
        MergeAR64Row = MergeAR64Row_NEON;
      }
    }
#endif

    for (y = 0; y < height; ++y) {
      MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width);
      src_r += src_stride_r;
      src_g += src_stride_g;
      src_b += src_stride_b;
      dst_ar64 += dst_stride_ar64;
    }
  }
}

LIBYUV_API
void MergeARGB16To8Plane(const uint16_t* src_r,
                         int src_stride_r,
                         const uint16_t* src_g,
                         int src_stride_g,
                         const uint16_t* src_b,
                         int src_stride_b,
                         const uint16_t* src_a,
                         int src_stride_a,
                         uint8_t* dst_argb,
                         int dst_stride_argb,
                         int width,
                         int height,
                         int depth) {
  int y;
  void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
                            const uint16_t* src_b, const uint16_t* src_a,
                            uint8_t* dst_argb, int depth, int width) =
      MergeARGB16To8Row_C;
  void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g,
                            const uint16_t* src_b, uint8_t* dst_argb, int depth,
                            int width) = MergeXRGB16To8Row_C;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  if (src_a == NULL) {
    // Coalesce rows.
    if (src_stride_r == width && src_stride_g == width &&
        src_stride_b == width && dst_stride_argb == width * 4) {
      width *= height;
      height = 1;
      src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
    }
#if defined(HAS_MERGEXRGB16TO8ROW_AVX2)
    if (TestCpuFlag(kCpuHasAVX2)) {
      MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2;
      if (IS_ALIGNED(width, 16)) {
        MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2;
      }
    }
#endif
#if defined(HAS_MERGEXRGB16TO8ROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON;
      if (IS_ALIGNED(width, 8)) {
        MergeXRGB16To8Row = MergeXRGB16To8Row_NEON;
      }
    }
#endif

    for (y = 0; y < height; ++y) {
      MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width);
      src_r += src_stride_r;
      src_g += src_stride_g;
      src_b += src_stride_b;
      dst_argb += dst_stride_argb;
    }
  } else {
    if (src_stride_r == width && src_stride_g == width &&
        src_stride_b == width && src_stride_a == width &&
        dst_stride_argb == width * 4) {
      width *= height;
      height = 1;
      src_stride_r = src_stride_g = src_stride_b = src_stride_a =
          dst_stride_argb = 0;
    }
#if defined(HAS_MERGEARGB16TO8ROW_AVX2)
    if (TestCpuFlag(kCpuHasAVX2)) {
      MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2;
      if (IS_ALIGNED(width, 16)) {
        MergeARGB16To8Row = MergeARGB16To8Row_AVX2;
      }
    }
#endif
#if defined(HAS_MERGEARGB16TO8ROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON;
      if (IS_ALIGNED(width, 8)) {
        MergeARGB16To8Row = MergeARGB16To8Row_NEON;
      }
    }
#endif

    for (y = 0; y < height; ++y) {
      MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width);
      src_r += src_stride_r;
      src_g += src_stride_g;
      src_b += src_stride_b;
      dst_argb += dst_stride_argb;
    }
  }
}

// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@@ -183,6 +183,44 @@ ANY41CT(I410AlphaToARGBRow_Any_AVX2,

#undef ANY41CT

// Any 4 planes to 1 plane with parameter
#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK)           \
  void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf,  \
               const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) {  \
    SIMD_ALIGNED(STYPE temp[16 * 4]);                                       \
    SIMD_ALIGNED(DTYPE out[64]);                                            \
    memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */                 \
    int r = width & MASK;                                                   \
    int n = width & ~MASK;                                                  \
    if (n > 0) {                                                            \
      ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, n);              \
    }                                                                       \
    memcpy(temp, r_buf + n, r * SBPP);                                      \
    memcpy(temp + 16, g_buf + n, r * SBPP);                                 \
    memcpy(temp + 32, b_buf + n, r * SBPP);                                 \
    memcpy(temp + 48, a_buf + n, r * SBPP);                                 \
    ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1);  \
    memcpy(dst_ptr + n * BPP, out, r * BPP);                                \
  }

#ifdef HAS_MERGEAR64ROW_AVX2
ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 4, 15)
#endif

#ifdef HAS_MERGEAR64ROW_NEON
ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 4, 7)
#endif

#ifdef HAS_MERGEARGB16TO8ROW_AVX2
ANY41PT(MergeARGB16To8Row_Any_AVX2, MergeARGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#endif

#ifdef HAS_MERGEARGB16TO8ROW_NEON
ANY41PT(MergeARGB16To8Row_Any_NEON, MergeARGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7)
#endif

#undef ANY41PT

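The MASK arithmetic above splits each row into a full-width SIMD part and a remainder that is re-run through the zero-padded temporaries. The same computation in isolation (SplitWidth is an illustrative name, not a libyuv function):

#include <assert.h>

// For MASK = 15 (AVX2 handles 16 pixels per iteration):
// n = pixels covered by the full-width SIMD loop,
// r = leftover pixels re-run on the 16-pixel scratch buffers.
static void SplitWidth(int width, int mask, int* n, int* r) {
  *n = width & ~mask;  // round down to a multiple of (mask + 1)
  *r = width & mask;   // remainder, 0..mask
}

// e.g. width = 100, mask = 15  ->  n = 96, r = 4:
// ANY_SIMD handles 96 pixels in place, then 4 via the temp copies.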
// Any 3 planes to 1.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)  \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,      \
@@ -212,13 +250,13 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#ifdef HAS_MERGERGBROW_MMI
ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
#endif
-#ifdef HAS_MERGEARGBROW_SSE2
+#ifdef HAS_MERGEXRGBROW_SSE2
ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
#endif
-#ifdef HAS_MERGEARGBROW_AVX2
+#ifdef HAS_MERGEXRGBROW_AVX2
ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
#endif
-#ifdef HAS_MERGEARGBROW_NEON
+#ifdef HAS_MERGEXRGBROW_NEON
ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
@@ -424,6 +462,52 @@ ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#undef ANY31CT

// Any 3 planes to 1 plane with parameter
#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK)           \
  void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf,  \
               DTYPE* dst_ptr, int depth, int width) {                      \
    SIMD_ALIGNED(STYPE temp[16 * 3]);                                       \
    SIMD_ALIGNED(DTYPE out[64]);                                            \
    memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */                 \
    int r = width & MASK;                                                   \
    int n = width & ~MASK;                                                  \
    if (n > 0) {                                                            \
      ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, n);                     \
    }                                                                       \
    memcpy(temp, r_buf + n, r * SBPP);                                      \
    memcpy(temp + 16, g_buf + n, r * SBPP);                                 \
    memcpy(temp + 32, b_buf + n, r * SBPP);                                 \
    ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1);             \
    memcpy(dst_ptr + n * BPP, out, r * BPP);                                \
  }

#ifdef HAS_MERGEXR30ROW_AVX2
ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#endif

#ifdef HAS_MERGEXR30ROW_NEON
ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3)
ANY31PT(MergeXR30Row_10_Any_NEON, MergeXR30Row_10_NEON, uint16_t, 2, uint8_t, 4, 3)
#endif

#ifdef HAS_MERGEXR64ROW_AVX2
ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 4, 15)
#endif

#ifdef HAS_MERGEXR64ROW_NEON
ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 4, 7)
#endif

#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
ANY31PT(MergeXRGB16To8Row_Any_AVX2, MergeXRGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#endif

#ifdef HAS_MERGEXRGB16TO8ROW_NEON
ANY31PT(MergeXRGB16To8Row_Any_NEON, MergeXRGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7)
#endif

#undef ANY31PT

// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr,  \
@@ -1711,16 +1795,16 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#ifdef HAS_SPLITRGBROW_MMI
ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#endif
-#ifdef HAS_SPLITARGBROW_SSE2
+#ifdef HAS_SPLITXRGBROW_SSE2
ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
#endif
-#ifdef HAS_SPLITARGBROW_SSSE3
+#ifdef HAS_SPLITXRGBROW_SSSE3
ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
#endif
-#ifdef HAS_SPLITARGBROW_AVX2
+#ifdef HAS_SPLITXRGBROW_AVX2
ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
#endif
-#ifdef HAS_SPLITARGBROW_NEON
+#ifdef HAS_SPLITXRGBROW_NEON
ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
#endif

@@ -56,6 +56,11 @@ static __inline int32_t clamp1023(int32_t v) {
  return (-(v >= 1023) | v) & 1023;
}

// clamp to 2^n - 1
static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
  return (-(v >= max) | v) & max;
}

static __inline uint32_t Abs(int32_t v) {
  int m = -(v < 0);
  return (v + m) ^ m;
@@ -73,6 +78,10 @@ static __inline int32_t clamp1023(int32_t v) {
  return (v > 1023) ? 1023 : v;
}

static __inline int32_t clamp2nm1(int32_t v, int32_t max) {
  return (v > max) ? max : v;
}

static __inline uint32_t Abs(int32_t v) {
  return (v < 0) ? -v : v;
}
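The branchless variant works because max has the form 2^n - 1: when v overflows, -(v >= max) is all ones, so the OR saturates every bit before the AND masks back to n bits; for in-range non-negative v both operations are no-ops. A standalone check (clamp2nm1_branchless is an illustrative copy of the code above):

#include <assert.h>
#include <stdint.h>

static int32_t clamp2nm1_branchless(int32_t v, int32_t max) {
  return (-(v >= max) | v) & max;  // max must be 2^n - 1
}

int main(void) {
  // Overflow: -(1) is all ones, OR saturates, AND keeps the low n bits.
  assert(clamp2nm1_branchless(70000, 65535) == 65535);
  // In range: OR with 0 is a no-op, AND preserves the value.
  assert(clamp2nm1_branchless(1000, 65535) == 1000);
  return 0;
}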
@@ -3010,6 +3019,105 @@ void MergeARGBRow_C(const uint8_t* src_r,
  }
}

void MergeXR30Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint8_t* dst_ar30,
                    int depth,
                    int width) {
  assert(depth >= 10);
  assert(depth <= 16);
  int x;
  int shift = depth - 10;
  uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30;
  for (x = 0; x < width; ++x) {
    uint32_t r = clamp1023(src_r[x] >> shift);
    uint32_t g = clamp1023(src_g[x] >> shift);
    uint32_t b = clamp1023(src_b[x] >> shift);
    dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000;
  }
}
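To make the AR30 packing concrete, one pixel's bit layout, derived from the shifts above (sample values are arbitrary 10-bit inputs):

#include <assert.h>
#include <stdint.h>

int main(void) {
  // AR30 (little-endian word): bits 0-9 B, 10-19 G, 20-29 R, 30-31 A.
  uint32_t r = 512, g = 256, b = 128;  // already clamped to 10 bits
  uint32_t ar30 = b | (g << 10) | (r << 20) | 0xc0000000u;
  assert((ar30 & 0x3ff) == 128);          // B
  assert(((ar30 >> 10) & 0x3ff) == 256);  // G
  assert(((ar30 >> 20) & 0x3ff) == 512);  // R
  assert((ar30 >> 30) == 3);              // 2-bit opaque alpha
  return 0;
}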
void MergeAR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    const uint16_t* src_a,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  int x;
  int shift = 16 - depth;
  int max = (1 << depth) - 1;
  for (x = 0; x < width; ++x) {
    dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
    dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
    dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
    dst_ar64[3] = clamp2nm1(src_a[x], max) << shift;
    dst_ar64 += 4;
  }
}

void MergeARGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         const uint16_t* src_a,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  int x;
  int shift = depth - 8;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = clamp255(src_b[x] >> shift);
    dst_argb[1] = clamp255(src_g[x] >> shift);
    dst_argb[2] = clamp255(src_r[x] >> shift);
    dst_argb[3] = clamp255(src_a[x] >> shift);
    dst_argb += 4;
  }
}
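Note the two directions scale differently: merging to AR64 left-shifts clamped samples into the top bits (shift = 16 - depth), while 16-to-8 right-shifts the extra precision away (shift = depth - 8). A standalone check, assuming 10-bit input:

#include <assert.h>
#include <stdint.h>

int main(void) {
  int depth = 10;
  uint16_t v10 = 1023;  // max 10-bit sample
  // Widen to 16 bits: 1023 << 6 = 0xffc0 (value lands in the top bits).
  assert((uint16_t)(v10 << (16 - depth)) == 0xffc0);
  // Narrow to 8 bits: 1023 >> 2 = 255.
  assert((v10 >> (depth - 8)) == 255);
  return 0;
}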
void MergeXR64Row_C(const uint16_t* src_r,
                    const uint16_t* src_g,
                    const uint16_t* src_b,
                    uint16_t* dst_ar64,
                    int depth,
                    int width) {
  assert(depth >= 1);
  assert(depth <= 16);
  int x;
  int shift = 16 - depth;
  int max = (1 << depth) - 1;
  for (x = 0; x < width; ++x) {
    dst_ar64[0] = clamp2nm1(src_b[x], max) << shift;
    dst_ar64[1] = clamp2nm1(src_g[x], max) << shift;
    dst_ar64[2] = clamp2nm1(src_r[x], max) << shift;
    dst_ar64[3] = 0xffff;
    dst_ar64 += 4;
  }
}

void MergeXRGB16To8Row_C(const uint16_t* src_r,
                         const uint16_t* src_g,
                         const uint16_t* src_b,
                         uint8_t* dst_argb,
                         int depth,
                         int width) {
  assert(depth >= 8);
  assert(depth <= 16);
  int x;
  int shift = depth - 8;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = clamp255(src_b[x] >> shift);
    dst_argb[1] = clamp255(src_g[x] >> shift);
    dst_argb[2] = clamp255(src_r[x] >> shift);
    dst_argb[3] = 0xff;
    dst_argb += 4;
  }
}

void SplitXRGBRow_C(const uint8_t* src_argb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,
@@ -5262,7 +5262,9 @@ void MergeARGBRow_SSE2(const uint8_t* src_r,
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

#ifdef HAS_MERGEXRGBROW_SSE2
void MergeXRGBRow_SSE2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
@@ -5346,7 +5348,9 @@ void MergeARGBRow_AVX2(const uint8_t* src_r,
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

#ifdef HAS_MERGEXRGBROW_AVX2
void MergeXRGBRow_AVX2(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
@@ -5440,7 +5444,9 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb,
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif

#ifdef HAS_SPLITXRGBROW_SSE2
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
@@ -5536,7 +5542,9 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
      : "m"(kShuffleMaskARGBSplit)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_SPLITXRGBROW_SSSE3
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,
@@ -5628,7 +5636,9 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
        "m"(kShuffleMaskARGBPermute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_SPLITXRGBROW_AVX2
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
@@ -5670,7 +5680,330 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
        "+r"(width)  // %4
      : "m"(kShuffleMaskARGBSplit),   // %5
        "m"(kShuffleMaskARGBPermute)  // %6
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif

#ifdef HAS_MERGEXR30ROW_AVX2
void MergeXR30Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = depth - 10;
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
      "vpsrlw $14,%%ymm5,%%ymm5 \n"
      "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
      "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n"
      "vpsrlw $6,%%ymm6,%%ymm6 \n"
      "vmovd %5,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu (%0,%1),%%ymm1 \n"
      "vmovdqu (%0,%2),%%ymm2 \n"
      "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n"
      "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n"
      "vpminuw %%ymm0,%%ymm6,%%ymm0 \n"
      "vpminuw %%ymm1,%%ymm6,%%ymm1 \n"
      "vpminuw %%ymm2,%%ymm6,%%ymm2 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm2,%%ymm2 \n"
      "vpsllw $0x4,%%ymm0,%%ymm0 \n"  // Shift R to target bit
      "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n"  // RB
      "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n"
      "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n"  // AG
      "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n"
      "vpslld $0xa,%%ymm1,%%ymm1 \n"  // Shift AG to target bit
      "vpslld $0xa,%%ymm2,%%ymm2 \n"
      "vpor %%ymm1,%%ymm0,%%ymm0 \n"  // Combine
      "vpor %%ymm2,%%ymm3,%%ymm3 \n"
      "vmovdqu %%ymm0,(%3) \n"
      "vmovdqu %%ymm3,0x20(%3) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
#if defined(__i386__)
      : "m"(shift)  // %5
#else
      : "rm"(shift)  // %5
#endif
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif

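The per-channel scale-and-clamp step the assembly above performs can be sketched with AVX2 intrinsics as follows (an illustration, not code from this patch; ScaleClamp10 is a hypothetical name):

#include <immintrin.h>

// Scale 16 16-bit samples from 'depth' bits down to 10 bits and clamp,
// mirroring the vpsrlw/vpminuw pair in MergeXR30Row_AVX2 above.
static inline __m256i ScaleClamp10(__m256i v, int depth) {
  __m128i shift = _mm_cvtsi32_si128(depth - 10);  // variable shift count
  v = _mm256_srl_epi16(v, shift);                 // v >>= (depth - 10)
  return _mm256_min_epu16(v, _mm256_set1_epi16(1023));
}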
#ifdef HAS_MERGEAR64ROW_AVX2
static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7};
void MergeAR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"
      "vmovdqa %8,%%ymm5 \n"
      "vmovd %6,%%xmm6 \n"
      "vbroadcastss %7,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vmovdqu (%0,%3),%%ymm3 \n"  // A
      "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
      "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
      "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
      "vpminuw %%ymm3,%%ymm7,%%ymm3 \n"
      "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpsllw %%xmm6,%%ymm3,%%ymm3 \n"
      "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
      "vpermd %%ymm3,%%ymm5,%%ymm3 \n"
      "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n"  // BG(low)
      "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"  // BG(hi)
      "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n"  // RA(low)
      "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n"  // RA(hi)
      "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n"  // BGRA(1)
      "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n"  // BGRA(3)
      "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA(2)
      "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n"  // BGRA(4)
      "vmovdqu %%ymm3,(%4) \n"
      "vmovdqu %%ymm2,0x20(%4) \n"
      "vmovdqu %%ymm4,0x40(%4) \n"
      "vmovdqu %%ymm1,0x60(%4) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x80(%4),%4 \n"
      "subl $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
      : "m"(shift),  // %6
        "m"(mask),   // %7
#else
        "+rm"(width)  // %5
      : "rm"(shift),  // %6
        "rm"(mask),   // %7
#endif
        "m"(MergeAR64Permute)  // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_MERGEXR64ROW_AVX2
void MergeXR64Row_AVX2(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  mask = (mask << 16) + mask;
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "vmovdqa %7,%%ymm5 \n"
      "vmovd %5,%%xmm6 \n"
      "vbroadcastss %6,%%ymm7 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vpminuw %%ymm0,%%ymm7,%%ymm0 \n"
      "vpminuw %%ymm1,%%ymm7,%%ymm1 \n"
      "vpminuw %%ymm2,%%ymm7,%%ymm2 \n"
      "vpsllw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsllw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsllw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpermd %%ymm0,%%ymm5,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm5,%%ymm1 \n"
      "vpermd %%ymm2,%%ymm5,%%ymm2 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"  // A (0xffff)
      "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n"  // BG(low)
      "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"  // BG(hi)
      "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n"  // RA(low)
      "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n"  // RA(hi)
      "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n"  // BGRA(1)
      "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n"  // BGRA(3)
      "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA(2)
      "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n"  // BGRA(4)
      "vmovdqu %%ymm3,(%3) \n"
      "vmovdqu %%ymm2,0x20(%3) \n"
      "vmovdqu %%ymm4,0x40(%3) \n"
      "vmovdqu %%ymm1,0x60(%3) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x80(%3),%3 \n"
      "subl $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar64),  // %3
        "+r"(width)      // %4
#if defined(__i386__)
      : "m"(shift),  // %5
        "m"(mask),   // %6
#else
      : "rm"(shift),  // %5
        "rm"(mask),   // %6
#endif
        "m"(MergeAR64Permute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif

#ifdef HAS_MERGEARGB16TO8ROW_AVX2
static const uvec8 MergeARGB16To8Shuffle = {0, 8,  1, 9,  2, 10, 3, 11,
                                            4, 12, 5, 13, 6, 14, 7, 15};
void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "sub %0,%3 \n"
      "vbroadcastf128 %7,%%ymm5 \n"
      "vmovd %6,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vmovdqu (%0,%3),%%ymm3 \n"  // A
      "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n"
      "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n"  // BG (planar)
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"  // RA (planar)
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"  // BG (interleave)
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // RA (interleave)
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA (low)
      "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n"  // BGRA (hi)
      "vmovdqu %%ymm2,(%4) \n"
      "vmovdqu %%ymm0,0x20(%4) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%4),%4 \n"
      "subl $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
#if defined(__i386__)
        "+m"(width)  // %5
      : "m"(shift),  // %6
#else
        "+rm"(width)  // %5
      : "rm"(shift),  // %6
#endif
        "m"(MergeARGB16To8Shuffle)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = depth - 8;
  asm volatile(

      "sub %0,%1 \n"
      "sub %0,%2 \n"
      "vbroadcastf128 %6,%%ymm5 \n"
      "vmovd %5,%%xmm6 \n"
      "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
      "vpsrlw $8,%%ymm3,%%ymm3 \n"  // A (0xff)

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // R
      "vmovdqu (%0,%1),%%ymm1 \n"  // G
      "vmovdqu (%0,%2),%%ymm2 \n"  // B
      "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n"
      "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n"
      "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n"
      "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n"  // BG (planar)
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"  // RA (planar)
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"  // BG (interleave)
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // RA (interleave)
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n"  // BGRA (low)
      "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n"  // BGRA (hi)
      "vmovdqu %%ymm2,(%3) \n"
      "vmovdqu %%ymm0,0x20(%3) \n"
      "lea 0x20(%0),%0 \n"
      "lea 0x40(%3),%3 \n"
      "subl $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
#if defined(__i386__)
      : "m"(shift),  // %5
#else
      : "rm"(shift),  // %5
#endif
        "m"(MergeARGB16To8Shuffle)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

@@ -415,11 +415,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READNV12 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_uv),    // %1
        "+r"(dst_argb),  // %2
@@ -438,11 +438,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READNV21 YUVTORGB
      "subs %3, %3, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_vu),    // %1
        "+r"(dst_argb),  // %2
@@ -537,11 +537,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READYUY2 YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
@@ -558,11 +558,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(YUVTORGB_SETUP
      "vmov.u8 d23, #255 \n"
      "1: \n" READUYVY YUVTORGB
      "subs %2, %2, #8 \n"
      "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
@@ -760,8 +760,8 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
      "vld1.8 {q1}, [%1]! \n"  // load G
      "vld1.8 {q0}, [%2]! \n"  // load B
      "subs %4, %4, #16 \n"  // 16 processed per loop
-      "vst4.8 {d0, d2, d4, d6}, [%4]! \n"  // store 8 ARGB
-      "vst4.8 {d1, d3, d5, d7}, [%4]! \n"  // next 8 ARGB
+      "vst4.8 {d0, d2, d4, d6}, [%3]! \n"  // store 8 ARGB
+      "vst4.8 {d1, d3, d5, d7}, [%3]! \n"  // next 8 ARGB
      "bgt 1b \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
@@ -773,6 +773,226 @@
      );
}

void MergeXR30Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = 10 - depth;
  asm volatile(
      "vmov.u32 q14, #1023 \n"
      "vdup.32 q15, %5 \n"
      "1: \n"
      "vld1.16 {d4}, [%2]! \n"  // B
      "vld1.16 {d2}, [%1]! \n"  // G
      "vld1.16 {d0}, [%0]! \n"  // R
      "vmovl.u16 q2, d4 \n"  // B
      "vmovl.u16 q1, d2 \n"  // G
      "vmovl.u16 q0, d0 \n"  // R
      "vshl.u32 q2, q2, q15 \n"  // 000B
      "vshl.u32 q1, q1, q15 \n"
      "vshl.u32 q0, q0, q15 \n"
      "vmin.u32 q2, q2, q14 \n"
      "vmin.u32 q1, q1, q14 \n"
      "vmin.u32 q0, q0, q14 \n"
      "vsli.u32 q2, q1, #10 \n"  // 00GB
      "vsli.u32 q2, q0, #20 \n"  // 0RGB
      "vorr.u32 q2, #0xc0000000 \n"  // ARGB (AR30)
      "subs %4, %4, #4 \n"
      "vst1.8 {q2}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      : "r"(shift)  // %5
      : "memory", "cc", "q0", "q1", "q2", "q14", "q15");
}

void MergeXR30Row_10_NEON(const uint16_t* src_r,
                          const uint16_t* src_g,
                          const uint16_t* src_b,
                          uint8_t* dst_ar30,
                          int /* depth */,
                          int width) {
  asm volatile(
      "vmov.u32 q14, #1023 \n"
      "1: \n"
      "vld1.16 {d4}, [%2]! \n"  // B
      "vld1.16 {d2}, [%1]! \n"  // G
      "vld1.16 {d0}, [%0]! \n"  // R
      "vmovl.u16 q2, d4 \n"  // 000B
      "vmovl.u16 q1, d2 \n"  // G
      "vmovl.u16 q0, d0 \n"  // R
      "vmin.u32 q2, q2, q14 \n"
      "vmin.u32 q1, q1, q14 \n"
      "vmin.u32 q0, q0, q14 \n"
      "vsli.u32 q2, q1, #10 \n"  // 00GB
      "vsli.u32 q2, q0, #20 \n"  // 0RGB
      "vorr.u32 q2, #0xc0000000 \n"  // ARGB (AR30)
      "subs %4, %4, #4 \n"
      "vst1.8 {q2}, [%3]! \n"
      "bgt 1b \n"
      "3: \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      :
      : "memory", "cc", "q0", "q1", "q2", "q14");
}

void MergeAR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  asm volatile(

      "vdup.u16 q15, %6 \n"
      "vdup.u16 q14, %7 \n"
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vld1.16 {q3}, [%3]! \n"  // A
      "vmin.u16 q2, q2, q14 \n"
      "vmin.u16 q1, q1, q14 \n"
      "vmin.u16 q0, q0, q14 \n"
      "vmin.u16 q3, q3, q14 \n"
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "vshl.u16 q3, q3, q15 \n"
      "subs %5, %5, #8 \n"
      "vst4.16 {d0, d2, d4, d6}, [%4]! \n"
      "vst4.16 {d1, d3, d5, d7}, [%4]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
        "+r"(width)      // %5
      : "r"(shift),  // %6
        "r"(mask)    // %7
      : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}

void MergeXR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  asm volatile(

      "vmov.u8 q3, #0xff \n"  // A (0xffff)
      "vdup.u16 q15, %5 \n"
      "vdup.u16 q14, %6 \n"
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vmin.u16 q2, q2, q14 \n"
      "vmin.u16 q1, q1, q14 \n"
      "vmin.u16 q0, q0, q14 \n"
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "subs %4, %4, #8 \n"
      "vst4.16 {d0, d2, d4, d6}, [%3]! \n"
      "vst4.16 {d1, d3, d5, d7}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar64),  // %3
        "+r"(width)      // %4
      : "r"(shift),  // %5
        "r"(mask)    // %6
      : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}

void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "vdup.16 q15, %6 \n"
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vld1.16 {q3}, [%3]! \n"  // A
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "vshl.u16 q3, q3, q15 \n"
      "vqmovn.u16 d0, q0 \n"
      "vqmovn.u16 d1, q1 \n"
      "vqmovn.u16 d2, q2 \n"
      "vqmovn.u16 d3, q3 \n"
      "subs %5, %5, #8 \n"
      "vst4.8 {d0, d1, d2, d3}, [%4]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : "r"(shift)  // %6
      : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}

void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "vdup.16 q15, %5 \n"
      "vmov.u8 d6, #0xff \n"  // A (0xff)
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "vqmovn.u16 d5, q2 \n"
      "vqmovn.u16 d4, q1 \n"
      "vqmovn.u16 d3, q0 \n"
      "subs %4, %4, #8 \n"
      "vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "r"(shift)  // %5
      : "memory", "cc", "q0", "q1", "q2", "d6", "q15");
}

// Copy multiple of 32. vld4.8 allows unaligned access and is fastest on A15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
@ -874,6 +874,240 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
|
||||
);
|
||||
}
|
||||
|
||||
void MergeXR30Row_NEON(const uint16_t* src_r,
|
||||
const uint16_t* src_g,
|
||||
const uint16_t* src_b,
|
||||
uint8_t* dst_ar30,
|
||||
int depth,
|
||||
int width) {
|
||||
int shift = 10 - depth;
|
||||
asm volatile(
|
||||
"movi v30.16b, #255 \n"
|
||||
"ushr v30.4s, v30.4s, #22 \n" // 1023
|
||||
"dup v31.4s, %w5 \n"
|
||||
"1: \n"
|
||||
"ldr d2, [%2], #8 \n" // B
|
||||
"ldr d1, [%1], #8 \n" // G
|
||||
"ldr d0, [%0], #8 \n" // R
|
||||
"ushll v2.4s, v2.4h, #0 \n" // B
|
||||
"ushll v1.4s, v1.4h, #0 \n" // G
|
||||
"ushll v0.4s, v0.4h, #0 \n" // R
|
||||
"ushl v2.4s, v2.4s, v31.4s \n" // 000B
|
||||
"ushl v1.4s, v1.4s, v31.4s \n" // G
|
||||
"ushl v0.4s, v0.4s, v31.4s \n" // R
|
||||
"umin v2.4s, v2.4s, v30.4s \n"
|
||||
"umin v1.4s, v1.4s, v30.4s \n"
|
||||
"umin v0.4s, v0.4s, v30.4s \n"
|
||||
"sli v2.4s, v1.4s, #10 \n" // 00GB
|
||||
"sli v2.4s, v0.4s, #20 \n" // 0RGB
|
||||
"orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
|
||||
"subs %w4, %w4, #4 \n"
|
||||
"str q2, [%3], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
"+r"(src_g), // %1
|
||||
"+r"(src_b), // %2
|
||||
"+r"(dst_ar30), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(shift) // %5
|
||||
: "memory", "cc", "v0", "v1", "v2", "v30", "v31");
|
||||
}
|
||||
|
||||
void MergeXR30Row_10_NEON(const uint16_t* src_r,
                          const uint16_t* src_g,
                          const uint16_t* src_b,
                          uint8_t* dst_ar30,
                          int /* depth */,
                          int width) {
  asm volatile(
      "movi v30.16b, #255 \n"
      "ushr v30.4s, v30.4s, #22 \n"  // 1023
      "1: \n"
      "ldr d2, [%2], #8 \n"  // B
      "ldr d1, [%1], #8 \n"  // G
      "ldr d0, [%0], #8 \n"  // R
      "ushll v2.4s, v2.4h, #0 \n"  // 000B
      "ushll v1.4s, v1.4h, #0 \n"  // G
      "ushll v0.4s, v0.4h, #0 \n"  // R
      "umin v2.4s, v2.4s, v30.4s \n"
      "umin v1.4s, v1.4s, v30.4s \n"
      "umin v0.4s, v0.4s, v30.4s \n"
      "sli v2.4s, v1.4s, #10 \n"  // 00GB
      "sli v2.4s, v0.4s, #20 \n"  // 0RGB
      "orr v2.4s, #0xc0, lsl #24 \n"  // ARGB (AR30)
      "subs %w4, %w4, #4 \n"
      "str q2, [%3], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      :
      : "memory", "cc", "v0", "v1", "v2", "v30");
}

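// When depth == 10 the shift count is zero, so the _10_ variant above drops
// the shift setup and the three ushl instructions entirely. A dispatcher
// might select between the two roughly like this (hypothetical sketch; the
// real selection lives in MergeXR30Plane in planar_functions.cc; assumes
// width is a multiple of 4 and stride == width):
static void MergeXR30RowsSketch(const uint16_t* r, const uint16_t* g,
                                const uint16_t* b, uint8_t* dst_ar30,
                                int depth, int width, int height) {
  void (*row)(const uint16_t*, const uint16_t*, const uint16_t*, uint8_t*,
              int, int) = MergeXR30Row_NEON;
  if (depth == 10) {
    row = MergeXR30Row_10_NEON;  // skip the no-op shifts
  }
  for (int y = 0; y < height; ++y) {
    row(r, g, b, dst_ar30, depth, width);
    r += width;
    g += width;
    b += width;
    dst_ar30 += width * 4;  // 4 bytes per AR30 pixel
  }
}
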
void MergeAR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  asm volatile(

      "dup v30.8h, %w7 \n"
      "dup v31.8h, %w6 \n"
      "1: \n"
      "ldr q2, [%0], #16 \n"  // R
      "ldr q1, [%1], #16 \n"  // G
      "ldr q0, [%2], #16 \n"  // B
      "ldr q3, [%3], #16 \n"  // A
      "umin v2.8h, v2.8h, v30.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "umin v1.8h, v1.8h, v30.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "umin v0.8h, v0.8h, v30.8h \n"
      "prfm pldl1keep, [%2, 448] \n"
      "umin v3.8h, v3.8h, v30.8h \n"
      "prfm pldl1keep, [%3, 448] \n"
      "ushl v2.8h, v2.8h, v31.8h \n"
      "ushl v1.8h, v1.8h, v31.8h \n"
      "ushl v0.8h, v0.8h, v31.8h \n"
      "ushl v3.8h, v3.8h, v31.8h \n"
      "subs %w5, %w5, #8 \n"
      "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
        "+r"(width)      // %5
      : "r"(shift),      // %6
        "r"(mask)        // %7
      : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31");
}

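// Illustrative scalar equivalent (not part of libyuv) of the kernel above:
// clamp each 'depth'-bit component to its maximum (umin against the mask),
// then scale it up to 16 bits by shifting left by (16 - depth), writing
// interleaved B,G,R,A uint16_t lanes. Assumes <stdint.h>.
static void MergeAR64Row_C_Sketch(const uint16_t* src_r, const uint16_t* src_g,
                                  const uint16_t* src_b, const uint16_t* src_a,
                                  uint16_t* dst_ar64, int depth, int width) {
  int shift = 16 - depth;
  uint16_t mask = (uint16_t)((1u << depth) - 1);
  for (int x = 0; x < width; ++x) {
    uint16_t r = src_r[x] > mask ? mask : src_r[x];
    uint16_t g = src_g[x] > mask ? mask : src_g[x];
    uint16_t b = src_b[x] > mask ? mask : src_b[x];
    uint16_t a = src_a[x] > mask ? mask : src_a[x];
    dst_ar64[0] = (uint16_t)(b << shift);
    dst_ar64[1] = (uint16_t)(g << shift);
    dst_ar64[2] = (uint16_t)(r << shift);
    dst_ar64[3] = (uint16_t)(a << shift);
    dst_ar64 += 4;
  }
}
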
void MergeXR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;
  int mask = (1 << depth) - 1;
  asm volatile(

      "movi v3.16b, #0xff \n"  // A (0xffff)
      "dup v30.8h, %w6 \n"
      "dup v31.8h, %w5 \n"

      "1: \n"
      "ldr q2, [%0], #16 \n"  // R
      "ldr q1, [%1], #16 \n"  // G
      "ldr q0, [%2], #16 \n"  // B
      "umin v2.8h, v2.8h, v30.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "umin v1.8h, v1.8h, v30.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "umin v0.8h, v0.8h, v30.8h \n"
      "prfm pldl1keep, [%2, 448] \n"
      "ushl v2.8h, v2.8h, v31.8h \n"
      "ushl v1.8h, v1.8h, v31.8h \n"
      "ushl v0.8h, v0.8h, v31.8h \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar64),  // %3
        "+r"(width)      // %4
      : "r"(shift),      // %5
        "r"(mask)        // %6
      : "memory", "cc", "v0", "v1", "v2", "v3", "v30", "v31");
}

void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "dup v31.8h, %w6 \n"
      "1: \n"
      "ldr q2, [%0], #16 \n"  // R
      "ldr q1, [%1], #16 \n"  // G
      "ldr q0, [%2], #16 \n"  // B
      "ldr q3, [%3], #16 \n"  // A
      "ushl v2.8h, v2.8h, v31.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "ushl v1.8h, v1.8h, v31.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "ushl v0.8h, v0.8h, v31.8h \n"
      "prfm pldl1keep, [%2, 448] \n"
      "ushl v3.8h, v3.8h, v31.8h \n"
      "prfm pldl1keep, [%3, 448] \n"
      "uqxtn v2.8b, v2.8h \n"
      "uqxtn v1.8b, v1.8h \n"
      "uqxtn v0.8b, v0.8h \n"
      "uqxtn v3.8b, v3.8h \n"
      "subs %w5, %w5, #8 \n"
      "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : "r"(shift)       // %6
      : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}

void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "dup v31.8h, %w5 \n"
      "movi v3.8b, #0xff \n"  // A (0xff)
      "1: \n"
      "ldr q2, [%0], #16 \n"  // R
      "ldr q1, [%1], #16 \n"  // G
      "ldr q0, [%2], #16 \n"  // B
      "ushl v2.8h, v2.8h, v31.8h \n"
      "prfm pldl1keep, [%0, 448] \n"
      "ushl v1.8h, v1.8h, v31.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "ushl v0.8h, v0.8h, v31.8h \n"
      "prfm pldl1keep, [%2, 448] \n"
      "uqxtn v2.8b, v2.8h \n"
      "uqxtn v1.8b, v1.8h \n"
      "uqxtn v0.8b, v0.8h \n"
      "subs %w4, %w4, #8 \n"
      "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5
      : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
}

// Copy a multiple of 32 bytes.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(

@@ -3091,6 +3091,164 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
  free_aligned_buffer_page_end(dst_pixels_c);
}

#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF)      \
  TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) {                        \
    const int kWidth = ((W1280) > 0) ? (W1280) : 1;                         \
    const int kPixels = (kWidth * benchmark_height_ + 15) & ~15;            \
    align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE));       \
    align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE));     \
    STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF);     \
    STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF);     \
    STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF);     \
    STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF);     \
    DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c);           \
    DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt);       \
    for (int i = 0; i < kPixels; ++i) {                                     \
      src_pixels_r[i] = fastrand() & 65535;                                 \
      src_pixels_g[i] = fastrand() & 65535;                                 \
      src_pixels_b[i] = fastrand() & 65535;                                 \
      src_pixels_a[i] = fastrand() & 65535;                                 \
    }                                                                       \
    memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE));                   \
    memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE));                 \
    MaskCpuFlags(disable_cpu_flags_);                                       \
    FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b,   \
                kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4,     \
                kWidth, NEG benchmark_height_, DEPTH);                      \
    MaskCpuFlags(benchmark_cpu_info_);                                      \
    for (int i = 0; i < benchmark_iterations_; ++i) {                       \
      FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
                  kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \
                  kWidth, NEG benchmark_height_, DEPTH);                    \
    }                                                                       \
    for (int i = 0; i < kPixels * 4; ++i) {                                 \
      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
    }                                                                       \
    free_aligned_buffer_page_end(src_memory_r);                             \
    free_aligned_buffer_page_end(src_memory_g);                             \
    free_aligned_buffer_page_end(src_memory_b);                             \
    free_aligned_buffer_page_end(src_memory_a);                             \
    free_aligned_buffer_page_end(dst_memory_c);                             \
    free_aligned_buffer_page_end(dst_memory_opt);                           \
  }

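// For orientation (illustrative; the generated code is mechanical): an
// instantiation such as
//   TESTQPLANARTOPI(MergeAR64, uint16_t, uint16_t, 10, benchmark_width_, _Opt, +, 0)
// expands to roughly:
//   TEST_F(LibYUVPlanarTest, MergeAR64Plane_10_Opt) {
//     // fill the R, G, B, A planes with random 16-bit values,
//     // run MergeAR64Plane once with SIMD masked off (C reference),
//     // run it benchmark_iterations_ times with SIMD enabled,
//     // then EXPECT_EQ every element of the two outputs.
//   }
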
#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF)     \
  TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) {                 \
    const int kWidth = ((W1280) > 0) ? (W1280) : 1;                         \
    const int kPixels = (kWidth * benchmark_height_ + 15) & ~15;            \
    align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE));       \
    align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE));     \
    STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF);     \
    STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF);     \
    STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF);     \
    DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c);           \
    DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt);       \
    for (int i = 0; i < kPixels; ++i) {                                     \
      src_pixels_r[i] = fastrand() & 65535;                                 \
      src_pixels_g[i] = fastrand() & 65535;                                 \
      src_pixels_b[i] = fastrand() & 65535;                                 \
    }                                                                       \
    memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE));                   \
    memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE));                 \
    MaskCpuFlags(disable_cpu_flags_);                                       \
    FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b,   \
                kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth,          \
                NEG benchmark_height_, DEPTH);                              \
    MaskCpuFlags(benchmark_cpu_info_);                                      \
    for (int i = 0; i < benchmark_iterations_; ++i) {                       \
      FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
                  kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth,      \
                  NEG benchmark_height_, DEPTH);                            \
    }                                                                       \
    for (int i = 0; i < kPixels * 4; ++i) {                                 \
      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
    }                                                                       \
    free_aligned_buffer_page_end(src_memory_r);                             \
    free_aligned_buffer_page_end(src_memory_g);                             \
    free_aligned_buffer_page_end(src_memory_b);                             \
    free_aligned_buffer_page_end(dst_memory_c);                             \
    free_aligned_buffer_page_end(dst_memory_opt);                           \
  }

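// Usage sketch (illustrative, not part of the test file): merging 10-bit
// planes with no alpha plane, the case the _Opaque tests above exercise.
// Per the API contract, a NULL src_a fills destination alpha with the opaque
// value. Strides for 16-bit planes are in uint16_t elements, as in the tests;
// buffers are assumed allocated by the caller.
static void ExampleMergeAR64Opaque(const uint16_t* src_r, const uint16_t* src_g,
                                   const uint16_t* src_b, uint16_t* dst_ar64,
                                   int width, int height) {
  MergeAR64Plane(src_r, width, src_g, width, src_b, width,
                 NULL, 0,              // no alpha plane -> opaque alpha
                 dst_ar64, width * 4,  // 4 uint16_t per pixel
                 width, height, 10);   // 10 bits per component
}
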
#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH)                              \
  TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
  TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +,  \
                  1)                                                           \
  TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0)  \
  TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)     \
  TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +,   \
                   0)                                                          \
  TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
                   1)                                                          \
  TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
  TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)

TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10)
TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12)
TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16)
TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10)
TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12)
TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)

#define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF)      \
  TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) {                        \
    const int kWidth = ((W1280) > 0) ? (W1280) : 1;                         \
    const int kPixels = (kWidth * benchmark_height_ + 15) & ~15;            \
    align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF);     \
    align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE));       \
    align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE));     \
    STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF);     \
    STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF);     \
    STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF);     \
    DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c);           \
    DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt);       \
    for (int i = 0; i < kPixels; ++i) {                                     \
      src_pixels_r[i] = fastrand() & 65535;                                 \
      src_pixels_g[i] = fastrand() & 65535;                                 \
      src_pixels_b[i] = fastrand() & 65535;                                 \
    }                                                                       \
    memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE));                   \
    memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE));                 \
    MaskCpuFlags(disable_cpu_flags_);                                       \
    FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b,   \
                kWidth, dst_pixels_c, kWidth * 4, kWidth,                   \
                NEG benchmark_height_, DEPTH);                              \
    MaskCpuFlags(benchmark_cpu_info_);                                      \
    for (int i = 0; i < benchmark_iterations_; ++i) {                       \
      FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
                  kWidth, dst_pixels_opt, kWidth * 4, kWidth,               \
                  NEG benchmark_height_, DEPTH);                            \
    }                                                                       \
    for (int i = 0; i < kPixels * 4; ++i) {                                 \
      EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);                        \
    }                                                                       \
    free_aligned_buffer_page_end(src_memory_r);                             \
    free_aligned_buffer_page_end(src_memory_g);                             \
    free_aligned_buffer_page_end(src_memory_b);                             \
    free_aligned_buffer_page_end(dst_memory_c);                             \
    free_aligned_buffer_page_end(dst_memory_opt);                           \
  }

#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH)                              \
  TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
  TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +,  \
                  1)                                                           \
  TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0)  \
  TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)

TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10)
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12)
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)

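// Usage sketch (illustrative, not part of the test file): packing 10-bit
// planes into AR30 via the public entry point the tests above exercise.
// 16-bit source strides are in uint16_t elements; the destination stride is
// in bytes (4 bytes per AR30 pixel). Buffers are assumed allocated by the
// caller.
static void ExampleMergeXR30(const uint16_t* src_r, const uint16_t* src_g,
                             const uint16_t* src_b, uint8_t* dst_ar30,
                             int width, int height) {
  MergeXR30Plane(src_r, width, src_g, width, src_b, width,
                 dst_ar30, width * 4,  // 4 bytes per AR30 pixel
                 width, height, 10);   // 10 bits per component
}
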
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {