Add NV12Mirror and MirrorUVPlane functions

Add an AVX2 version of HalfMergeUVRow

Skylake Xeon performance for 1280x720
NV12Mirror_Any (109 ms)
NV12Mirror_Unaligned (113 ms)
NV12Mirror_Invert (107 ms)
NV12Mirror_Opt (108 ms)
NV12Mirror_NullY (19 ms)

Slightly faster than the comparable I420Mirror:
I420Mirror_Any (113 ms)
I420Mirror_Unaligned (110 ms)
I420Mirror_Invert (109 ms)
I420Mirror_Opt (110 ms)

BUG=libyuv:840, libyuv:858

Change-Id: I686b1b778383bfa10ecd1655e986bdc99e76d132
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2176066
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2020-05-04 12:32:28 -07:00 committed by Commit Bot
parent d9681c53b3
commit 7a61759f78
17 changed files with 432 additions and 87 deletions


@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1749
Version: 1751
License: BSD
License File: LICENSE


@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
Most NV12 functions allow the destination Y pointer to be NULL.
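For example, the NV12Mirror function added in this change skips the Y plane when dst_y is NULL and mirrors only the interleaved UV plane. A minimal sketch (assumes tightly packed planes and an even width; the helper name is illustrative):

#include "libyuv/planar_functions.h"

// Mirror only the chroma of a packed NV12 image. The Y plane is left untouched
// because dst_y is NULL, in which case dst_stride_y is ignored.
void MirrorNV12ChromaOnly(const uint8_t* src_y, const uint8_t* src_uv,
                          uint8_t* dst_uv, int width, int height) {
  NV12Mirror(src_y, width, src_uv, width,
             /*dst_y=*/NULL, 0, dst_uv, width, width, height);
}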


@@ -190,7 +190,7 @@ mips
make V=1 -f linux.mk
make V=1 -f linux.mk clean
make V=1 -f linux.mk CXX=clang++
make V=1 -f linux.mk CXX=clang++ CC=clang
## Building the library with cmake


@@ -314,6 +314,22 @@ int I400Mirror(const uint8_t* src_y,
int width,
int height);
// Alias
#define NV12ToNV12Mirror NV12Mirror
// NV12 mirror.
LIBYUV_API
int NV12Mirror(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Alias
#define ARGBToARGBMirror ARGBMirror
@@ -347,6 +363,15 @@ void MirrorPlane(const uint8_t* src_y,
int width,
int height);
// Mirror a plane of UV data.
LIBYUV_API
void MirrorUVPlane(const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
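A minimal usage sketch of the new MirrorUVPlane entry point declared above (illustrative only; the helper name is not part of the library and a tightly packed interleaved UV plane is assumed):

#include "libyuv/planar_functions.h"

// Mirror a standalone interleaved UV plane, e.g. the chroma plane of NV12.
// width is in UV pixels (half the image width for 420 data); strides are bytes.
void MirrorChromaPlane(const uint8_t* src_uv, uint8_t* dst_uv,
                       int uv_width, int uv_height) {
  MirrorUVPlane(src_uv, uv_width * 2, dst_uv, uv_width * 2, uv_width, uv_height);
}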


@@ -274,16 +274,18 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_HALFMERGEUVROW_SSSE3
// I210 is for H010. 2 = 422. I for 601 vs H for 709.
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_AVX2
#define HAS_MIRRORUVROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
@@ -299,6 +301,7 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
@@ -368,6 +371,7 @@ extern "C" {
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
@@ -1574,6 +1578,13 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
@@ -1735,6 +1746,13 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
uint8_t* dst_uv,
int width);
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_uv,
int width);
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1749
#define LIBYUV_VERSION 1751
#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -1049,6 +1049,56 @@ void MirrorPlane(const uint8_t* src_y,
}
}
// Mirror a plane of UV data.
LIBYUV_API
void MirrorUVPlane(const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
MirrorUVRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uv = src_uv + (height - 1) * src_stride_uv;
src_stride_uv = -src_stride_uv;
}
#if defined(HAS_MIRRORUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorUVRow = MirrorUVRow_Any_NEON;
if (IS_ALIGNED(width, 32)) {
MirrorUVRow = MirrorUVRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorUVRow = MirrorUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
MirrorUVRow = MirrorUVRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorUVRow = MirrorUVRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MirrorUVRow = MirrorUVRow_AVX2;
}
}
#endif
// MirrorUV plane
for (y = 0; y < height; ++y) {
MirrorUVRow(src_uv, dst_uv, width);
src_uv += src_stride_uv;
dst_uv += dst_stride_uv;
}
}
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1089,7 +1139,7 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
@@ -1113,6 +1163,42 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
// NV12 mirror.
LIBYUV_API
int NV12Mirror(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_uv || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_uv = src_uv + (halfheight - 1) * src_stride_uv;
src_stride_y = -src_stride_y;
src_stride_uv = -src_stride_uv;
}
if (dst_y) {
MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
halfheight);
return 0;
}
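As the sign handling above shows, a negative height flips the copy vertically; combined with the horizontal mirror that amounts to a 180-degree rotation. A usage sketch (packed planes and even width assumed; the helper name is illustrative):

// Rotate a packed NV12 frame by 180 degrees: NV12Mirror reverses each row, and
// the negative height makes it read the source rows bottom-to-top.
int RotateNV12By180(const uint8_t* src_y, const uint8_t* src_uv,
                    uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
  return NV12Mirror(src_y, width, src_uv, width, dst_y, width, dst_uv, width,
                    width, -height);
}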
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1136,7 +1222,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -4136,7 +4222,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
HalfMergeUVRow = HalfMergeUVRow_SSSE3;
}
#endif
#if defined(HAS_HALFMERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
HalfMergeUVRow = HalfMergeUVRow_AVX2;
}
#endif
for (y = 0; y < height - 1; y += 2) {
// Merge a row of U and V into a row of UV.
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);


@@ -347,7 +347,7 @@ void RotateUV180(const uint8_t* src,
void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
int width) = MirrorSplitUVRow_C;
#if defined(HAS_MIRRORSPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
MirrorSplitUVRow = MirrorSplitUVRow_NEON;
}
#endif


@@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}


@@ -1182,6 +1182,15 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#ifdef HAS_MIRRORROW_MMI
ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
#endif
#ifdef HAS_MIRRORUVROW_AVX2
ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
#endif
#ifdef HAS_MIRRORUVROW_SSSE3
ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
#endif
#ifdef HAS_MIRRORUVROW_NEON
ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
#endif
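In ANY11M the third argument is bytes per pixel (2 for interleaved UV) and the fourth is the width mask the SIMD kernel requires. A conceptual sketch of how such an Any wrapper splits a mirrored row between the SIMD kernel and the C fallback (illustrative only, not the actual ANY11M macro):

#include "libyuv/row.h"

// Conceptual Any wrapper for the SSSE3 UV mirror (mask 7, 2 bytes per pixel):
// the SIMD kernel handles a multiple of 8 pixels, the C row does the rest.
void MirrorUVRow_Any_Sketch(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int remainder = width & 7;           // pixels the SIMD kernel cannot cover
  int simd_width = width - remainder;  // multiple of 8
  if (simd_width > 0) {
    // Mirroring reverses order, so the SIMD kernel consumes the *last*
    // simd_width source pixels and produces the *first* destination pixels.
    MirrorUVRow_SSSE3(src_uv + remainder * 2, dst_uv, simd_width);
  }
  // The leading source pixels become the trailing destination pixels.
  MirrorUVRow_C(src_uv, dst_uv + simd_width * 2, remainder);
}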
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -1189,7 +1198,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15)
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)


@@ -2162,6 +2162,17 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width; ++x) {
dst_uv[0] = src_uv[0];
dst_uv[1] = src_uv[1];
src_uv -= 2;
dst_uv += 2;
}
}
void MirrorSplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,


@@ -3229,9 +3229,61 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu -0x10(%0,%2,2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirrorUV) // %3
: "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORUVROW_SSSE3
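The same per-block operation expressed with SSSE3 intrinsics, as a reference sketch (not the shipped kernel; assumes width is a multiple of 8, as the non-Any path requires):

#include <stdint.h>
#include <tmmintrin.h>  // SSSE3

// Reverse the order of UV pairs in a row, 8 pairs (16 bytes) per iteration,
// walking the source from its end toward its start.
static const uint8_t kShuffleMirrorUVSketch[16] = {14, 15, 12, 13, 10, 11, 8, 9,
                                                   6,  7,  4,  5,  2,  3,  0, 1};

void MirrorUVRow_SSSE3_Sketch(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  const __m128i shuf = _mm_loadu_si128((const __m128i*)kShuffleMirrorUVSketch);
  const uint8_t* src = src_uv + width * 2 - 16;  // last 8 UV pairs of the row
  for (int x = 0; x < width; x += 8) {
    __m128i block = _mm_loadu_si128((const __m128i*)src);
    block = _mm_shuffle_epi8(block, shuf);  // reverse pair order within the block
    _mm_storeu_si128((__m128i*)dst_uv, block);
    src -= 16;
    dst_uv += 16;
  }
}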
#ifdef HAS_MIRRORUVROW_AVX2
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpermq $0x4e,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirrorUV) // %3
: "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORUVROW_AVX2
#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
@@ -3257,7 +3309,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "m"(kShuffleMirrorSplitUV) // %4
: "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_MIRRORSPLITUVROW_SSSE3
@@ -7052,6 +7104,54 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_uv,
int width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
"1: \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // load 32 U values
"vmovdqu (%1),%%ymm1 \n" // load 32 V values
"vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
"vmovdqu 0(%1,%5,1),%%ymm3 \n"
"lea 0x20(%0),%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x20(%1),%1 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
"lea 0x20(%2),%2 \n"
"sub $0x20,%3 \n" // 32 src pixels per loop
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride_u)), // %4
"r"((intptr_t)(src_stride_v)) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // defined(__x86_64__) || defined(__i386__)
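What the AVX2 kernel above computes, as a hedged scalar sketch (not the library's HalfMergeUVRow_C): each output UV byte is the rounded average of a 2x2 block, which is what the vpmaddubsw/vpaddw/vpsrlw/vpavgw sequence produces. An even width is assumed:

#include <stdint.h>

// Halve a row pair of planar U and V (2x2 box filter with rounding) and
// interleave the results as UV, one output pair per two input columns.
void HalfMergeUVRow_Sketch(const uint8_t* src_u, int src_stride_u,
                           const uint8_t* src_v, int src_stride_v,
                           uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_uv[0] = (src_u[x] + src_u[x + 1] + src_u[x + src_stride_u] +
                 src_u[x + src_stride_u + 1] + 2) >> 2;
    dst_uv[1] = (src_v[x] + src_v[x + 1] + src_v[x + src_stride_v] +
                 src_v[x + src_stride_v + 1] + 2) >> 2;
    dst_uv += 2;
  }
}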
#ifdef __cplusplus


@@ -701,6 +701,26 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
: "cc", "memory", "q0", "q1", "q2");
}
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
"mov r12, #-16 \n"
"add %0, %0, %2, lsl #1 \n"
"sub %0, #16 \n"
"1: \n"
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %2, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n"
"vst2.8 {d0, d1}, [%1]! \n" // dst += 16
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
:
: "cc", "memory", "r12", "q0");
}
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,


@@ -747,67 +747,99 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
"ld1 {v3.16b}, [%4] \n" // shuffler
"ld1 {v3.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #32 \n"
"1: \n"
"ld1 {v1.16b,v2.16b}, [%0], %3 \n" // src -= 32
"ldr q2, [%0, 16] \n"
"ldr q1, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #32 \n" // 32 pixels per loop.
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v0.16b, {v2.16b}, v3.16b \n"
"tbl v1.16b, {v1.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)-32), // %3
"r"(&kShuffleMirror) // %4
: "r"(&kShuffleMirror) // %3
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #1 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"(&kShuffleMirrorUV) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
// Start at end of source row.
"ld1 {v4.16b}, [%4] \n" // shuffler
"add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #16 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
"st1 {v1.8b}, [%2], #8 \n"
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w3, %w3, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"uzp1 v0.16b, v2.16b, v3.16b \n" // U
"uzp2 v1.16b, v2.16b, v3.16b \n" // V
"st1 {v0.16b}, [%1], #16 \n" // dst += 16
"st1 {v1.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1");
: "r"(&kShuffleMirrorUV) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Shuffle table for reversing the ARGB.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
"ld1 {v4.16b}, [%4] \n" // shuffler
"add %0, %0, %w2, sxtw #2 \n" // Start at end of row.
"sub %0, %0, #64 \n"
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3\n" // src -= 64
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v0.16b, {v0.16b}, v4.16b \n"
"tbl v1.16b, {v1.16b}, v4.16b \n"
"tbl v2.16b, {v2.16b}, v4.16b \n"
"tbl v3.16b, {v3.16b}, v4.16b \n"
"st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n" // dst += 64
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #8 \n" // 8 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)-64), // %3
"r"(&kShuffleMirror) // %4
: "r"(&kShuffleMirrorARGB) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
@@ -3249,20 +3281,27 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Shuffle table for swapping UV bytes.
static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
"ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
"ld1 {v1.16b}, [%0], 16 \n"
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
"tbl v0.16b, {v0.16b}, v2.16b \n"
"tbl v1.16b, {v1.16b}, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"stp q0, q1, [%1], 32 \n" // store 16 VU pixels
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
:
: "r"(&kShuffleSwapUV) // %3
: "cc", "memory", "v0", "v1", "v2");
}


@@ -497,6 +497,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \


@@ -782,44 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
}
}
TEST_F(LibYUVPlanarTest, TestARGBMirror) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
align_buffer_page_end(dst_pixels_opt,
benchmark_width_ * benchmark_height_ * 4);
align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i / 4;
}
ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
MaskCpuFlags(disable_cpu_flags_);
ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
SIMD_ALIGNED(uint8_t orig_pixels[1280]);
SIMD_ALIGNED(uint8_t dst_pixels[1280]);
TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i] = i;
}
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
MaskCpuFlags(disable_cpu_flags_);
MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
for (int i = 0; i < benchmark_iterations_; ++i) {
MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
align_buffer_page_end(dst_pixels_opt,
benchmark_width_ * benchmark_height_ * 2);
align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
MaskCpuFlags(disable_cpu_flags_);
MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
benchmark_width_ * 2, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, TestShade) {