Add MergeARGBPlane and SplitARGBPlane

These functions convert between planar and interleaved ARGB.
When no alpha plane is given (NULL), merge fills alpha with 255
and split discards the alpha channel.

This helps handle YUV(A) with the Identity matrix, which is
essentially planar ARGB.
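
For illustration, a minimal round-trip sketch (the function name and
64x48 dimensions are hypothetical; passing NULL/0 for the alpha plane
selects the XRGB variants instead):

  #include "libyuv/planar_functions.h"

  void RoundTripExample(void) {
    enum { kW = 64, kH = 48 };
    static uint8_t argb[kW * kH * 4];  // interleaved, 4 bytes per pixel
    static uint8_t r[kW * kH], g[kW * kH], b[kW * kH], a[kW * kH];

    // Interleaved -> planar, then back again.
    SplitARGBPlane(argb, kW * 4, r, kW, g, kW, b, kW, a, kW, kW, kH);
    MergeARGBPlane(r, kW, g, kW, b, kW, a, kW, argb, kW * 4, kW, kH);
  }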

libyuv_unittest --gtest_filter=LibYUVPlanarTest.*ARGBPlane*:LibYUVPlanarTest.*XRGBPlane*

R=fbarchard@google.com

Change-Id: I522a189b434f490ba1723ce51317727e7c5eb112
Bug: libyuv:877
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2649887
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Yuan Tong 2021-01-27 08:55:58 +08:00 committed by Frank Barchard
parent f7c0a73a3e
commit a85cc26fde
10 changed files with 1497 additions and 2 deletions


@@ -153,6 +153,38 @@ void MergeRGBPlane(const uint8_t* src_r,
int width,
int height);
// Split interleaved ARGB plane into separate R, G, B and A planes.
// dst_a can be NULL to discard alpha plane.
LIBYUV_API
void SplitARGBPlane(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_r,
int dst_stride_r,
uint8_t* dst_g,
int dst_stride_g,
uint8_t* dst_b,
int dst_stride_b,
uint8_t* dst_a,
int dst_stride_a,
int width,
int height);
// Merge separate R, G, B and A planes into one interleaved ARGB plane.
// src_a can be NULL to fill opaque value to alpha.
LIBYUV_API
void MergeARGBPlane(const uint8_t* src_r,
int src_stride_r,
const uint8_t* src_g,
int src_stride_g,
const uint8_t* src_b,
int src_stride_b,
const uint8_t* src_a,
int src_stride_a,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height);
// Copy I400. Supports inverting.
LIBYUV_API
int I400ToI400(const uint8_t* src_y,


@@ -280,12 +280,14 @@ extern "C" {
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGEARGBROW_SSE2
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_AVX2
#define HAS_MIRRORUVROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITARGBROW_SSE2
#define HAS_SPLITARGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#endif
@@ -304,6 +306,7 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_MERGEARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I400TOARGBROW_AVX2
@@ -311,8 +314,10 @@ extern "C" {
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2
#define HAS_MIRRORUVROW_AVX2
#define HAS_MULTIPLYROW_16_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24
// #define HAS_NV21TOYUV24ROW_AVX2
@@ -373,6 +378,7 @@ extern "C" {
#define HAS_I422TOYUY2ROW_NEON
#define HAS_I444TOARGBROW_NEON
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
@@ -400,6 +406,7 @@ extern "C" {
#define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
#define HAS_SPLITARGBROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_SWAPUVROW_NEON
@@ -1823,6 +1830,182 @@ void MergeRGBRow_Any_MMI(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
void MergeARGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void SplitARGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void MergeXRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void SplitXRGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void MergeUVRow_16_C(const uint16_t* src_u,
const uint16_t* src_v,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1773
#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -772,6 +772,270 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
LIBYUV_API
void SplitARGBPlane(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_r,
int dst_stride_r,
uint8_t* dst_g,
int dst_stride_g,
uint8_t* dst_b,
int dst_stride_b,
uint8_t* dst_a,
int dst_stride_a,
int width,
int height) {
int y;
void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
uint8_t* dst_b, uint8_t* dst_a, int width) =
SplitARGBRow_C;
void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
uint8_t* dst_b, int width) = SplitXRGBRow_C;
if (dst_a == NULL) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_r = dst_r + (height - 1) * dst_stride_r;
dst_g = dst_g + (height - 1) * dst_stride_g;
dst_b = dst_b + (height - 1) * dst_stride_b;
dst_stride_r = -dst_stride_r;
dst_stride_g = -dst_stride_g;
dst_stride_b = -dst_stride_b;
}
// Coalesce rows.
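// If all planes are stored contiguously (stride == row width), the
// whole image can be processed as one long row.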
if (src_stride_argb == width * 4 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
dst_stride_a = 0;
}
#if defined(HAS_SPLITARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitXRGBRow = SplitXRGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
SplitXRGBRow = SplitXRGBRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitXRGBRow = SplitXRGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
SplitXRGBRow = SplitXRGBRow_SSSE3;
}
}
#endif
#if defined(HAS_SPLITARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitXRGBRow = SplitXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
SplitXRGBRow = SplitXRGBRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitXRGBRow = SplitXRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitXRGBRow = SplitXRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width);
dst_r += dst_stride_r;
dst_g += dst_stride_g;
dst_b += dst_stride_b;
src_argb += src_stride_argb;
}
} else {
if (height < 0) {
height = -height;
dst_r = dst_r + (height - 1) * dst_stride_r;
dst_g = dst_g + (height - 1) * dst_stride_g;
dst_b = dst_b + (height - 1) * dst_stride_b;
dst_a = dst_a + (height - 1) * dst_stride_a;
dst_stride_r = -dst_stride_r;
dst_stride_g = -dst_stride_g;
dst_stride_b = -dst_stride_b;
dst_stride_a = -dst_stride_a;
}
if (src_stride_argb == width * 4 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width &&
dst_stride_a == width) {
width *= height;
height = 1;
src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b =
dst_stride_a = 0;
}
#if defined(HAS_SPLITARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitARGBRow = SplitARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
SplitARGBRow = SplitARGBRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
SplitARGBRow = SplitARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
SplitARGBRow = SplitARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_SPLITARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitARGBRow = SplitARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
SplitARGBRow = SplitARGBRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitARGBRow = SplitARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitARGBRow = SplitARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
dst_r += dst_stride_r;
dst_g += dst_stride_g;
dst_b += dst_stride_b;
dst_a += dst_stride_a;
src_argb += src_stride_argb;
}
}
}
LIBYUV_API
void MergeARGBPlane(const uint8_t* src_r,
int src_stride_r,
const uint8_t* src_g,
int src_stride_g,
const uint8_t* src_b,
int src_stride_b,
const uint8_t* src_a,
int src_stride_a,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int y;
void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g,
const uint8_t* src_b, const uint8_t* src_a,
uint8_t* dst_argb, int width) = MergeARGBRow_C;
void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
const uint8_t* src_b, uint8_t* dst_argb, int width) =
MergeXRGBRow_C;
if (src_a == NULL) {
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
// Coalesce rows.
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0;
}
#if defined(HAS_MERGEARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeXRGBRow = MergeXRGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
MergeXRGBRow = MergeXRGBRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeXRGBRow = MergeXRGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeXRGBRow = MergeXRGBRow_AVX2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeXRGBRow = MergeXRGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MergeXRGBRow = MergeXRGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeXRGBRow(src_r, src_g, src_b, dst_argb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_argb += dst_stride_argb;
}
} else {
if (height < 0) {
height = -height;
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
if (src_stride_r == width && src_stride_g == width &&
src_stride_b == width && src_stride_a == width &&
dst_stride_argb == width * 4) {
width *= height;
height = 1;
src_stride_r = src_stride_g = src_stride_b = src_stride_a =
dst_stride_argb = 0;
}
#if defined(HAS_MERGEARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeARGBRow = MergeARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
MergeARGBRow = MergeARGBRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeARGBRow = MergeARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeARGBRow = MergeARGBRow_AVX2;
}
}
#endif
#if defined(HAS_MERGEARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeARGBRow = MergeARGBRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MergeARGBRow = MergeARGBRow_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width);
src_r += src_stride_r;
src_g += src_stride_g;
src_b += src_stride_b;
dst_argb += dst_stride_argb;
}
}
}
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,


@@ -30,6 +30,37 @@ extern "C" {
// Subsampled source needs to be increased by 1 if not even.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
// Any 4 planes to 1
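// The SIMD kernel handles the largest multiple of (MASK + 1) pixels;
// the remaining 1..MASK pixels are copied into 64-byte-spaced temp
// planes, run through the kernel once at width MASK + 1, and the
// result copied back out.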
#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
int width) { \
SIMD_ALIGNED(uint8_t temp[64 * 5]); \
memset(temp, 0, 64 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 192, a_buf + n, r); \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \
SS(r, DUVSHIFT) * BPP); \
}
#ifdef HAS_MERGEARGBROW_SSE2
ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7)
#endif
#ifdef HAS_MERGEARGBROW_AVX2
ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_MERGEARGBROW_NEON
ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15)
#endif
// Any 4 planes to 1 with yuvconstants
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
@@ -113,6 +144,15 @@ ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#ifdef HAS_MERGERGBROW_MMI
ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
#endif
#endif #endif
#ifdef HAS_MERGEARGBROW_SSE2
ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
#endif
#ifdef HAS_MERGEARGBROW_AVX2
ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_MERGEARGBROW_NEON
ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
@@ -1382,6 +1422,51 @@ ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#ifdef HAS_SPLITRGBROW_MMI
ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#endif
#endif #endif
#ifdef HAS_SPLITARGBROW_SSE2
ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_AVX2
ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
#endif
#ifdef HAS_SPLITARGBROW_NEON
ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
#endif
// Any 1 to 4. Outputs ARGB planes.
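// Remainder handling mirrors ANY41 above: leftover pixels go through
// the kernel once via 16-byte-spaced temp planes before being copied
// to the four destination planes.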
#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \
uint8_t* dst_b, uint8_t* dst_a, int width) { \
SIMD_ALIGNED(uint8_t temp[16 * 8]); \
memset(temp, 0, 16 * 4); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n); \
} \
memcpy(temp, src_ptr + n * BPP, r * BPP); \
ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \
MASK + 1); \
memcpy(dst_r + n, temp + 16 * 4, r); \
memcpy(dst_g + n, temp + 16 * 5, r); \
memcpy(dst_b + n, temp + 16 * 6, r); \
memcpy(dst_a + n, temp + 16 * 7, r); \
}
#ifdef HAS_SPLITARGBROW_SSE2
ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_AVX2
ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15)
#endif
#ifdef HAS_SPLITARGBROW_NEON
ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
#endif
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.


@@ -2476,6 +2476,67 @@ void MergeRGBRow_C(const uint8_t* src_r,
}
}
void SplitARGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_b[x] = src_argb[0];
dst_g[x] = src_argb[1];
dst_r[x] = src_argb[2];
dst_a[x] = src_argb[3];
src_argb += 4;
}
}
void MergeARGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_argb[0] = src_b[x];
dst_argb[1] = src_g[x];
dst_argb[2] = src_r[x];
dst_argb[3] = src_a[x];
dst_argb += 4;
}
}
void SplitXRGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_b[x] = src_argb[0];
dst_g[x] = src_argb[1];
dst_r[x] = src_argb[2];
src_argb += 4;
}
}
void MergeXRGBRow_C(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width; ++x) {
dst_argb[0] = src_b[x];
dst_argb[1] = src_g[x];
dst_argb[2] = src_r[x];
dst_argb[3] = 255;
dst_argb += 4;
}
}
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 128 = 9 bits
// 64 = 10 bits


@@ -4075,6 +4075,446 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
}
#endif // HAS_MERGERGBROW_SSSE3
#ifdef HAS_MERGEARGBROW_SSE2
void MergeARGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"sub %0,%3 \n"
LABELALIGN
"1: \n"
"movq (%0,%2),%%xmm0 \n" // B
"movq (%0),%%xmm1 \n" // R
"movq (%0,%1),%%xmm2 \n" // G
"punpcklbw %%xmm1,%%xmm0 \n" // BR
"movq (%0,%3),%%xmm1 \n" // A
"punpcklbw %%xmm1,%%xmm2 \n" // GA
"movdqa %%xmm0,%%xmm1 \n" // BR
"punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
"punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
"movdqu %%xmm0,(%4) \n"
"movdqu %%xmm1,16(%4) \n"
"lea 8(%0),%0 \n"
"lea 32(%4),%4 \n"
"sub $0x8,%5 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void MergeXRGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movq (%2),%%xmm0 \n" // B
"movq (%0),%%xmm1 \n" // R
"movq (%1),%%xmm2 \n" // G
"punpcklbw %%xmm1,%%xmm0 \n" // BR
"pcmpeqd %%xmm1,%%xmm1 \n" // A(255)
"punpcklbw %%xmm1,%%xmm2 \n" // GA
"movdqa %%xmm0,%%xmm1 \n" // BR
"punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi)
"punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo)
"movdqu %%xmm0,(%3) \n"
"movdqu %%xmm1,16(%3) \n"
"lea 8(%0),%0 \n"
"lea 8(%1),%1 \n"
"lea 8(%2),%2 \n"
"lea 32(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEARGBROW_SSE2
#ifdef HAS_MERGEARGBROW_AVX2
void MergeARGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"sub %0,%1 \n"
"sub %0,%2 \n"
"sub %0,%3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0,%2),%%xmm0 \n" // B
"vmovdqu (%0,%1),%%xmm1 \n" // R
"vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
"vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A
"vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%4) \n" // First 8
"vmovdqu %%ymm1,32(%4) \n" // Next 8
"lea 16(%0),%0 \n"
"lea 64(%4),%4 \n"
"sub $0x10,%5 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void MergeXRGBRow_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"vmovdqu (%2),%%xmm0 \n" // B
"vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255)
"vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R
"vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G
"vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n"
"vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%3) \n" // First 8
"vmovdqu %%ymm1,32(%3) \n" // Next 8
"lea 16(%0),%0 \n"
"lea 16(%1),%1 \n"
"lea 16(%2),%2 \n"
"lea 64(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif // HAS_MERGEARGBROW_AVX2
#ifdef HAS_SPLITARGBROW_SSE2
void SplitARGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"subq %1,%2 \n"
"subq %1,%3 \n"
"subq %1,%4 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
"punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
"punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%1,%3) \n" // B
"movhps %%xmm0,(%1,%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"movhps %%xmm2,(%1,%4) \n" // A
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"sub $0x8,%5 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void SplitXRGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17
"punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B
"punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo)
"punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%3) \n" // B
"movhps %%xmm0,(%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"lea 8(%2),%2 \n"
"lea 8(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
:
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
static const uvec8 kShuffleMaskARGBSplit = {0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u,
2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
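// pshufb with this mask gathers the four bytes of each channel from a
// block of 4 BGRA pixels into one 32-bit lane: BBBBGGGGRRRRAAAA.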
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"subq %1,%2 \n"
"subq %1,%3 \n"
"subq %1,%4 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %6,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %6,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%1,%3) \n" // B
"movhps %%xmm0,(%1,%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"movhps %%xmm2,(%1,%4) \n" // A
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"sub $0x8,%5 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: "m"(kShuffleMaskARGBSplit) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %5,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %5,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
"movlps %%xmm0,(%3) \n" // B
"movhps %%xmm0,(%2) \n" // G
"movlps %%xmm2,(%1) \n" // R
"lea 32(%0),%0 \n"
"lea 8(%1),%1 \n"
"lea 8(%2),%2 \n"
"lea 8(%3),%3 \n"
"sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif
#ifdef HAS_SPLITARGBROW_AVX2
static const lvec8 kShuffleMaskARGBSplit_AVX2 = {
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u,
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
static const ulvec32 kShuffleMaskARGBPermute_AVX2 = {0u, 4u, 1u, 5u,
2u, 6u, 3u, 7u};
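// After vpshufb groups channel bytes within each 128-bit lane, vpermd
// with this pattern pairs matching channel dwords across lanes, so a
// single dword unpack yields complete 16-pixel B/G/R/A rows.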
void SplitARGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"subq %1,%2 \n"
"subq %1,%3 \n"
"subq %1,%4 \n"
"vmovdqu %7,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00-0F
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %6,%%ymm0,%%ymm0 \n"
"vpshufb %6,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
"vmovdqu %%xmm0,(%1,%3) \n" // B
"vextracti128 $1,%%ymm0,(%1) \n" // R
"vmovdqu %%xmm2,(%1,%2) \n" // G
"vextracti128 $1,%%ymm2,(%1,%4) \n" // A
"lea 64(%0),%0 \n"
"lea 16(%1),%1 \n"
"sub $0x10,%5 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: "m"(kShuffleMaskARGBSplit_AVX2), // %6
"m"(kShuffleMaskARGBPermute_AVX2) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
"vmovdqu %6,%%ymm3 \n" LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00-0F
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %5,%%ymm0,%%ymm0 \n"
"vpshufb %5,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR
"vmovdqu %%xmm0,(%3) \n" // B
"vextracti128 $1,%%ymm0,(%1) \n" // R
"vmovdqu %%xmm2,(%2) \n" // G
"lea 64(%0),%0 \n"
"lea 16(%1),%1 \n"
"lea 16(%2),%2 \n"
"lea 16(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit_AVX2), // %5
"m"(kShuffleMaskARGBPermute_AVX2) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(


@@ -666,6 +666,113 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
void SplitARGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
"subs %5, %5, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%3]! \n" // store B
"vst1.8 {q1}, [%2]! \n" // store G
"vst1.8 {q2}, [%1]! \n" // store R
"vst1.8 {q3}, [%4]! \n" // store A
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q0}, [%2]! \n" // load B
"vld1.8 {q3}, [%3]! \n" // load A
"subs %5, %5, #16 \n" // 16 processed per loop
"vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
"vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
"subs %4, %4, #16 \n" // 16 processed per loop
"vst1.8 {q0}, [%3]! \n" // store B
"vst1.8 {q1}, [%2]! \n" // store G
"vst1.8 {q2}, [%1]! \n" // store R
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
"vmov.u8 q3, #255 \n" // load A(255)
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
"vld1.8 {q0}, [%2]! \n" // load B
"subs %4, %4, #16 \n" // 16 processed per loop
"vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB
"vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB
"bgt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
// Copy multiple of 32. vld4.8 allows unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(


@@ -763,6 +763,118 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
void SplitARGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"prfm pldl1keep, [%0, 448] \n"
"subs %w5, %w5, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
"st1 {v3.16b}, [%4], #16 \n" // store A
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(dst_a), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
"ld1 {v3.16b}, [%3], #16 \n" // load A
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"subs %w5, %w5, #16 \n" // 16 processed per loop
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
"prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%3], #16 \n" // store B
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%1], #16 \n" // store R
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile(
"movi v3.16b, #255 \n" // load A(255)
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Copy multiple of 32.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(


@@ -2776,6 +2776,217 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(tmp_pixels_a, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(tmp_pixels_a, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, tmp_pixels_a, benchmark_width_,
benchmark_width_, benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, tmp_pixels_a, benchmark_width_,
benchmark_width_, benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
benchmark_width_, tmp_pixels_b, benchmark_width_,
tmp_pixels_a, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(tmp_pixels_a);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(tmp_pixels_a, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(tmp_pixels_a, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, tmp_pixels_a, benchmark_width_,
benchmark_width_, benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
dst_pixels_c, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a,
benchmark_width_, benchmark_width_, benchmark_height_);
}
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(tmp_pixels_a);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g,
benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0,
dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
align_buffer_page_end(tmp_pixels_b, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
MemRandomize(src_pixels, kPixels * 4);
MemRandomize(tmp_pixels_r, kPixels);
MemRandomize(tmp_pixels_g, kPixels);
MemRandomize(tmp_pixels_b, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 4);
MemRandomize(dst_pixels_c, kPixels * 4);
MaskCpuFlags(disable_cpu_flags_);
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b,
benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r,
benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
}
MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_,
tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_r);
free_aligned_buffer_page_end(tmp_pixels_g);
free_aligned_buffer_page_end(tmp_pixels_b);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
// TODO(fbarchard): improve test for platforms and cpu detect
#ifdef HAS_MERGEUVROW_16_AVX2
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {