NV12Mirror and MirrorUVPlane functions added

HalfMergeUVRow AVX2 version added

Skylake Xeon performance for 1280x720:
NV12Mirror_Any (109 ms)
NV12Mirror_Unaligned (113 ms)
NV12Mirror_Invert (107 ms)
NV12Mirror_Opt (108 ms)
NV12Mirror_NullY (19 ms)

Slightly faster than the comparable I420Mirror:
I420Mirror_Any (113 ms)
I420Mirror_Unaligned (110 ms)
I420Mirror_Invert (109 ms)
I420Mirror_Opt (110 ms)

BUG=libyuv:840, libyuv:858

Change-Id: I686b1b778383bfa10ecd1655e986bdc99e76d132
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2176066
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2020-05-04 12:32:28 -07:00 committed by Commit Bot
parent d9681c53b3
commit 7a61759f78
17 changed files with 432 additions and 87 deletions
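
For context, a minimal usage sketch of the new NV12Mirror entry point follows. It is only a sketch: the frame size, contiguous buffer layout, and the helper name MirrorNV12Frame are illustrative assumptions, not part of the change; the function signature itself is the one added to the planar functions header below.

#include <vector>

#include "libyuv/planar_functions.h"

// Sketch: horizontally mirror a 1280x720 NV12 frame with contiguous planes,
// so every stride equals the width. NV12Mirror returns 0 on success and -1
// on invalid arguments; a negative height inverts the source vertically.
int MirrorNV12Frame() {
  const int width = 1280;
  const int height = 720;
  std::vector<uint8_t> src_y(width * height);
  std::vector<uint8_t> src_uv(width * height / 2);  // interleaved UV, half height
  std::vector<uint8_t> dst_y(width * height);
  std::vector<uint8_t> dst_uv(width * height / 2);
  // ... fill src_y / src_uv with pixel data ...
  return libyuv::NV12Mirror(src_y.data(), width,   // Y plane and stride
                            src_uv.data(), width,  // UV plane and stride
                            dst_y.data(), width,
                            dst_uv.data(), width,
                            width, height);
}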

View File

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1749
+Version: 1751
 License: BSD
 License File: LICENSE

View File

@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
 height chroma channel, and therefore is a 420 subsampling.
 NV16 is 16 bits per pixel, with half width and full height. aka 422.
 NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
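
As a hedged illustration of the note above (the helper name MirrorChromaOnly is hypothetical; only the NULL destination Y behavior comes from this change), mirroring just the chroma of an NV12 frame could look like:

#include <cstdint>

#include "libyuv/planar_functions.h"

// Sketch: mirror only the interleaved UV plane of an NV12 image by passing
// NULL as the destination Y pointer, so the Y plane is left untouched
// (the case the NV12Mirror_NullY benchmark above exercises).
int MirrorChromaOnly(const uint8_t* src_y, const uint8_t* src_uv,
                     uint8_t* dst_uv, int width, int height) {
  // The source Y pointer must still be non-NULL; only dst_y may be NULL.
  return libyuv::NV12Mirror(src_y, width, src_uv, width,
                            /*dst_y=*/nullptr, /*dst_stride_y=*/0,
                            dst_uv, width, width, height);
}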

View File

@@ -190,7 +190,7 @@ mips
 make V=1 -f linux.mk
 make V=1 -f linux.mk clean
-make V=1 -f linux.mk CXX=clang++
+make V=1 -f linux.mk CXX=clang++ CC=clang
 
 ## Building the library with cmake

View File

@@ -314,6 +314,22 @@ int I400Mirror(const uint8_t* src_y,
                int width,
                int height);
 
+// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
 // Alias
 #define ARGBToARGBMirror ARGBMirror
@@ -347,6 +363,15 @@ void MirrorPlane(const uint8_t* src_y,
                  int width,
                  int height);
 
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height);
+
 // Convert NV12 to RGB565.
 LIBYUV_API
 int NV12ToRGB565(const uint8_t* src_y,

View File

@@ -274,16 +274,18 @@ extern "C" {
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
 #define HAS_HALFMERGEUVROW_SSSE3
+// I210 is for H010. 2 = 422. I for 601 vs H for 709.
 #define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
 #define HAS_I422TOAR30ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
 #define HAS_RAWTORGBAROW_SSSE3
 #define HAS_RGB24MIRRORROW_SSSE3
 #define HAS_RGBATOYJROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #define HAS_SWAPUVROW_SSSE3
 #endif
 
 // The following are available for AVX2 gcc/clang x86 platforms:
@@ -299,6 +301,7 @@ extern "C" {
 #define HAS_ARGBTORGB24ROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
 #define HAS_I210TOAR30ROW_AVX2
 #define HAS_I210TOARGBROW_AVX2
 #define HAS_I422TOAR30ROW_AVX2
@@ -368,6 +371,7 @@ extern "C" {
 #define HAS_J400TOARGBROW_NEON
 #define HAS_MERGEUVROW_NEON
 #define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
 #define HAS_MIRRORSPLITUVROW_NEON
 #define HAS_NV12TOARGBROW_NEON
 #define HAS_NV12TORGB24ROW_NEON
@@ -1574,6 +1578,13 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 
 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                             uint8_t* dst_u,
@@ -1735,6 +1746,13 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                           uint8_t* dst_uv,
                           int width);
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
 
 void SplitRGBRow_C(const uint8_t* src_rgb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,

View File

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1749
+#define LIBYUV_VERSION 1751
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -775,7 +775,7 @@ int YUY2ToI420(const uint8_t* src_yuy2,
     }
   }
 #endif
 #if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     YUY2ToYRow = YUY2ToYRow_Any_MSA;
     YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
@@ -1476,7 +1476,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
     }
   }
 #endif
 #if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
     RGB24ToYRow = RGB24ToYRow_Any_MSA;

View File

@@ -1049,6 +1049,56 @@ void MirrorPlane(const uint8_t* src_y,
   }
 }
 
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height) {
+  int y;
+  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+      MirrorUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorUVRow = MirrorUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorUVRow = MirrorUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorUVRow = MirrorUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorUVRow = MirrorUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorUVRow = MirrorUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorUVRow = MirrorUVRow_AVX2;
+    }
+  }
+#endif
+
+  // MirrorUV plane
+  for (y = 0; y < height; ++y) {
+    MirrorUVRow(src_uv, dst_uv, width);
+    src_uv += src_stride_uv;
+    dst_uv += dst_stride_uv;
+  }
+}
+
 // Mirror I400 with optional flipping
 LIBYUV_API
 int I400Mirror(const uint8_t* src_y,
@@ -1089,7 +1139,7 @@ int I420Mirror(const uint8_t* src_y,
                int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+  if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
       height == 0) {
     return -1;
   }
@@ -1113,6 +1163,42 @@ int I420Mirror(const uint8_t* src_y,
   return 0;
 }
 
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_uv || !dst_uv || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+                halfheight);
+  return 0;
+}
+
 // ARGB mirror.
 LIBYUV_API
 int ARGBMirror(const uint8_t* src_argb,
@@ -1136,7 +1222,7 @@ int ARGBMirror(const uint8_t* src_argb,
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }
@@ -4136,7 +4222,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
     HalfMergeUVRow = HalfMergeUVRow_SSSE3;
   }
 #endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    HalfMergeUVRow = HalfMergeUVRow_AVX2;
+  }
+#endif
   for (y = 0; y < height - 1; y += 2) {
     // Merge a row of U and V into a row of UV.
     HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);

View File

@@ -347,7 +347,7 @@ void RotateUV180(const uint8_t* src,
   void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
                            int width) = MirrorSplitUVRow_C;
 #if defined(HAS_MIRRORSPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
     MirrorSplitUVRow = MirrorSplitUVRow_NEON;
   }
 #endif

View File

@@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb,
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }

View File

@@ -1182,6 +1182,15 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
 #ifdef HAS_MIRRORROW_MMI
 ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
 #endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #endif
@@ -1189,7 +1198,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
 #endif
 #ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
 #endif
 #ifdef HAS_ARGBMIRRORROW_MSA
 ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)

View File

@@ -2162,6 +2162,17 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   }
 }
 
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  int x;
+  src_uv += (width - 1) << 1;
+  for (x = 0; x < width; ++x) {
+    dst_uv[0] = src_uv[0];
+    dst_uv[1] = src_uv[1];
+    src_uv -= 2;
+    dst_uv += 2;
+  }
+}
+
 void MirrorSplitUVRow_C(const uint8_t* src_uv,
                         uint8_t* dst_u,
                         uint8_t* dst_v,

View File

@@ -3229,10 +3229,62 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2
 
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile(
+      "movdqa %3,%%xmm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+      "pshufb %%xmm5,%%xmm0 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "lea 0x10(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
+}
+#endif  // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%1) \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x10,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
+}
+#endif  // HAS_MIRRORUVROW_AVX2
+
 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
-                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+
 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -3253,11 +3305,11 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
       "lea 0x8(%1),%1 \n"
       "sub $8,%3 \n"
      "jg 1b \n"
      : "+r"(src),                   // %0
        "+r"(dst_u),                 // %1
        "+r"(dst_v),                 // %2
        "+r"(temp_width)             // %3
-      : "m"(kShuffleMirrorUV)        // %4
+      : "m"(kShuffleMirrorSplitUV)   // %4
       : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
@@ -7052,6 +7104,54 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+      "1: \n"
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"  // load 32 U values
+      "vmovdqu (%1),%%ymm1 \n"  // load 32 V values
+      "vmovdqu 0(%0,%4,1),%%ymm2 \n"  // 32 from next row
+      "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+      "lea 0x20(%0),%0 \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // half size
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+      "lea 0x20(%1),%1 \n"
+      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%2) \n"  // store 16 UV pixels
+      "lea 0x20(%2),%2 \n"
+      "sub $0x20,%3 \n"  // 32 src pixels per loop
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_u),                    // %0
+        "+r"(src_v),                    // %1
+        "+r"(dst_uv),                   // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
+        "r"((intptr_t)(src_stride_v))   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus

View File

@@ -701,6 +701,26 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
       : "cc", "memory", "q0", "q1", "q2");
 }
 
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov r12, #-16 \n"
+      "add %0, %0, %2, lsl #1 \n"
+      "sub %0, #16 \n"
+      "1: \n"
+      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
+      "subs %2, #8 \n"  // 8 pixels per loop.
+      "vrev64.8 q0, q0 \n"
+      "vst2.8 {d0, d1}, [%1]! \n"  // dst += 16
+      "bgt 1b \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_uv),  // %1
+        "+r"(width)    // %2
+      :
+      : "cc", "memory", "r12", "q0");
+}
+
 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,

View File

@@ -747,67 +747,99 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
       // Start at end of source row.
-      "ld1 {v3.16b}, [%4] \n"  // shuffler
+      "ld1 {v3.16b}, [%3] \n"  // shuffler
       "add %0, %0, %w2, sxtw \n"
       "sub %0, %0, #32 \n"
       "1: \n"
-      "ld1 {v1.16b,v2.16b}, [%0], %3 \n"  // src -= 32
+      "ldr q2, [%0, 16] \n"
+      "ldr q1, [%0], -32 \n"  // src -= 32
       "subs %w2, %w2, #32 \n"  // 32 pixels per loop.
-      "tbl v1.16b, {v1.16b}, v3.16b \n"
       "tbl v0.16b, {v2.16b}, v3.16b \n"
+      "tbl v1.16b, {v1.16b}, v3.16b \n"
       "st1 {v0.16b, v1.16b}, [%1], #32 \n"  // store 32 pixels
       "b.gt 1b \n"
       : "+r"(src),    // %0
         "+r"(dst),    // %1
         "+r"(width)   // %2
-      : "r"((ptrdiff_t)-32),   // %3
-        "r"(&kShuffleMirror)   // %4
+      : "r"(&kShuffleMirror)   // %3
       : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "ld1 {v4.16b}, [%3] \n"  // shuffler
+      "add %0, %0, %w2, sxtw #1 \n"
+      "sub %0, %0, #32 \n"
+      "1: \n"
+      "ldr q1, [%0, 16] \n"
+      "ldr q0, [%0], -32 \n"  // src -= 32
+      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
+      "tbl v2.16b, {v1.16b}, v4.16b \n"
+      "tbl v3.16b, {v0.16b}, v4.16b \n"
+      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
+      "b.gt 1b \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_uv),  // %1
+        "+r"(width)    // %2
+      : "r"(&kShuffleMirrorUV)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
   asm volatile(
       // Start at end of source row.
+      "ld1 {v4.16b}, [%4] \n"  // shuffler
       "add %0, %0, %w3, sxtw #1 \n"
-      "sub %0, %0, #16 \n"
+      "sub %0, %0, #32 \n"
       "1: \n"
-      "ld2 {v0.8b, v1.8b}, [%0], %4 \n"  // src -= 16
-      "subs %w3, %w3, #8 \n"  // 8 pixels per loop.
-      "rev64 v0.8b, v0.8b \n"
-      "rev64 v1.8b, v1.8b \n"
-      "st1 {v0.8b}, [%1], #8 \n"  // dst += 8
-      "st1 {v1.8b}, [%2], #8 \n"
+      "ldr q1, [%0, 16] \n"
+      "ldr q0, [%0], -32 \n"  // src -= 32
+      "subs %w3, %w3, #16 \n"  // 16 pixels per loop.
+      "tbl v2.16b, {v1.16b}, v4.16b \n"
+      "tbl v3.16b, {v0.16b}, v4.16b \n"
+      "uzp1 v0.16b, v2.16b, v3.16b \n"  // U
+      "uzp2 v1.16b, v2.16b, v3.16b \n"  // V
+      "st1 {v0.16b}, [%1], #16 \n"  // dst += 16
+      "st1 {v1.16b}, [%2], #16 \n"
       "b.gt 1b \n"
       : "+r"(src_uv),  // %0
         "+r"(dst_u),   // %1
        "+r"(dst_v),    // %2
        "+r"(width)     // %3
-      : "r"((ptrdiff_t)-16)  // %4
-      : "cc", "memory", "v0", "v1");
+      : "r"(&kShuffleMirrorUV)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
 }
 
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+                                         4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   asm volatile(
-      "ld1 {v4.16b}, [%4] \n"  // shuffler
-      "add %0, %0, %w2, sxtw #2 \n"  // Start at end of row.
-      "sub %0, %0, #64 \n"
+      // Start at end of source row.
+      "ld1 {v4.16b}, [%3] \n"  // shuffler
+      "add %0, %0, %w2, sxtw #2 \n"
+      "sub %0, %0, #32 \n"
       "1: \n"
-      "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3 \n"  // src -= 64
-      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
-      "tbl v0.16b, {v0.16b}, v4.16b \n"
-      "tbl v1.16b, {v1.16b}, v4.16b \n"
-      "tbl v2.16b, {v2.16b}, v4.16b \n"
-      "tbl v3.16b, {v3.16b}, v4.16b \n"
-      "st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n"  // dst += 64
+      "ldr q1, [%0, 16] \n"
+      "ldr q0, [%0], -32 \n"  // src -= 32
+      "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
+      "tbl v2.16b, {v1.16b}, v4.16b \n"
+      "tbl v3.16b, {v0.16b}, v4.16b \n"
+      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
       "b.gt 1b \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
-      : "r"((ptrdiff_t)-64),      // %3
-        "r"(&kShuffleMirror)      // %4
+      : "r"(&kShuffleMirrorARGB)  // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
 }
@@ -3249,20 +3281,27 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
       : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
 // Convert UV plane of NV12 to VU of NV21.
 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
   asm volatile(
+      "ld1 {v2.16b}, [%3] \n"  // shuffler
       "1: \n"
-      "ld2 {v0.16b, v1.16b}, [%0], #32 \n"  // load 16 UV values
-      "orr v2.16b, v0.16b, v0.16b \n"  // move U after V
+      "ld1 {v0.16b}, [%0], 16 \n"  // load 16 UV values
+      "ld1 {v1.16b}, [%0], 16 \n"
       "subs %w2, %w2, #16 \n"  // 16 pixels per loop
-      "st2 {v1.16b, v2.16b}, [%1], #32 \n"  // store 16 VU pixels
+      "tbl v0.16b, {v0.16b}, v2.16b \n"
+      "tbl v1.16b, {v1.16b}, v2.16b \n"
       "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
+      "stp q0, q1, [%1], 32 \n"  // store 16 VU pixels
       "b.gt 1b \n"
       : "+r"(src_uv),  // %0
         "+r"(dst_vu),  // %1
         "+r"(width)    // %2
-      :
+      : "r"(&kShuffleSwapUV)  // %3
       : "cc", "memory", "v0", "v1", "v2");
 }

View File

@@ -497,6 +497,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
                     SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
 
 TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
 
 #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,         \
                          FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \

View File

@@ -782,44 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
   }
 }
 
-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
-  SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
-  SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
-
-  for (int i = 0; i < 1280; ++i) {
-    orig_pixels[i][0] = i;
-    orig_pixels[i][1] = i / 2;
-    orig_pixels[i][2] = i / 3;
-    orig_pixels[i][3] = i / 4;
-  }
-  ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
-
-  for (int i = 0; i < 1280; ++i) {
-    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
-    EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
-    EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
-    EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
-  }
-  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
-  }
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_pixels_opt,
+                        benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
+
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+             benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+               benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
 }
 
-TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
-  SIMD_ALIGNED(uint8_t orig_pixels[1280]);
-  SIMD_ALIGNED(uint8_t dst_pixels[1280]);
-
-  for (int i = 0; i < 1280; ++i) {
-    orig_pixels[i] = i;
-  }
-  MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
-
-  for (int i = 0; i < 1280; ++i) {
-    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
-  }
-  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
-  }
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
+
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+  MaskCpuFlags(disable_cpu_flags_);
+  MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+              benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+                benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_pixels_opt,
+                        benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+  MaskCpuFlags(disable_cpu_flags_);
+  MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+                benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+                  benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
 }
 
 TEST_F(LibYUVPlanarTest, TestShade) {