Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-07 01:06:46 +08:00)
NV12Mirror and MirrorUVPlane functions added
HalfMergeUV AVX2 version

Skylake Xeon performance for 1280x720:
  NV12Mirror_Any        109 ms
  NV12Mirror_Unaligned  113 ms
  NV12Mirror_Invert     107 ms
  NV12Mirror_Opt        108 ms
  NV12Mirror_NullY       19 ms

Slightly faster than comparable I420Mirror:
  I420Mirror_Any        113 ms
  I420Mirror_Unaligned  110 ms
  I420Mirror_Invert     109 ms
  I420Mirror_Opt        110 ms

BUG=libyuv:840, libyuv:858
Change-Id: I686b1b778383bfa10ecd1655e986bdc99e76d132
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2176066
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
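For reference, a minimal calling sketch for the new NV12Mirror entry point, assuming the declaration shown in the diff below; the umbrella header, buffer names, and the 1280x720 frame size are illustrative and not part of the patch.

#include <cstdint>
#include <vector>

#include "libyuv.h"  // assumed umbrella header; NV12Mirror is declared in the diff below

// Mirror a 1280x720 NV12 frame horizontally. The UV plane is interleaved and
// half height, so it occupies width * ((height + 1) / 2) bytes. Per this
// change, dst_y may be nullptr to mirror only the UV plane, and a negative
// height flips the image vertically as well.
void MirrorNV12Example() {
  const int width = 1280;
  const int height = 720;
  const int half_height = (height + 1) / 2;
  std::vector<uint8_t> src_y(width * height);
  std::vector<uint8_t> src_uv(width * half_height);
  std::vector<uint8_t> dst_y(width * height);
  std::vector<uint8_t> dst_uv(width * half_height);
  // Fill src_y / src_uv with frame data here.
  libyuv::NV12Mirror(src_y.data(), width,   // src Y plane, stride in bytes
                     src_uv.data(), width,  // src UV plane, stride in bytes
                     dst_y.data(), width,   // dst Y plane (may be nullptr)
                     dst_uv.data(), width,  // dst UV plane, stride in bytes
                     width, height);
}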
Commit 7a61759f78 (parent d9681c53b3)
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1749
Version: 1751
License: BSD
License File: LICENSE
@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
Most NV12 functions allow the destination Y pointer to be NULL.

@@ -190,7 +190,7 @@ mips

make V=1 -f linux.mk
make V=1 -f linux.mk clean
make V=1 -f linux.mk CXX=clang++
make V=1 -f linux.mk CXX=clang++ CC=clang

## Building the library with cmake
@@ -314,6 +314,22 @@ int I400Mirror(const uint8_t* src_y,
               int width,
               int height);

// Alias
#define NV12ToNV12Mirror NV12Mirror

// NV12 mirror.
LIBYUV_API
int NV12Mirror(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_uv,
               int src_stride_uv,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height);

// Alias
#define ARGBToARGBMirror ARGBMirror

@@ -347,6 +363,15 @@ void MirrorPlane(const uint8_t* src_y,
                 int width,
                 int height);

// Mirror a plane of UV data.
LIBYUV_API
void MirrorUVPlane(const uint8_t* src_uv,
                   int src_stride_uv,
                   uint8_t* dst_uv,
                   int dst_stride_uv,
                   int width,
                   int height);

// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
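A hedged usage sketch for MirrorUVPlane on its own, based on the declaration above and on the halfwidth/halfheight values that NV12Mirror forwards later in this change; the plane dimensions, stride, and function name below are illustrative.

#include <cstdint>

#include "libyuv.h"  // assumed umbrella header; MirrorUVPlane is declared above

// Mirror the interleaved chroma plane of a 1280x720 NV12 frame. The width
// argument counts UV pairs (640 here), matching the halfwidth that
// NV12Mirror passes through; each row is therefore 1280 bytes of UV data.
void MirrorChromaExample(const uint8_t* src_uv, uint8_t* dst_uv) {
  const int uv_pairs_per_row = 640;
  const int uv_rows = 360;
  const int uv_stride_bytes = uv_pairs_per_row * 2;
  libyuv::MirrorUVPlane(src_uv, uv_stride_bytes, dst_uv, uv_stride_bytes,
                        uv_pairs_per_row, uv_rows);
}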
@@ -274,16 +274,18 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_HALFMERGEUVROW_SSSE3
// I210 is for H010. 2 = 422. I for 601 vs H for 709.
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_AVX2
#define HAS_MIRRORUVROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3

#endif

// The following are available for AVX2 gcc/clang x86 platforms:
@@ -299,6 +301,7 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2

@@ -368,6 +371,7 @@ extern "C" {
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
@@ -1574,6 +1578,13 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);

void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                            uint8_t* dst_u,

@@ -1735,6 +1746,13 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                          uint8_t* dst_uv,
                          int width);

void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width);

void SplitRGBRow_C(const uint8_t* src_rgb,
                   uint8_t* dst_r,
                   uint8_t* dst_g,
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 1749
#define LIBYUV_VERSION 1751

#endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -1049,6 +1049,56 @@ void MirrorPlane(const uint8_t* src_y,
  }
}

// Mirror a plane of UV data.
LIBYUV_API
void MirrorUVPlane(const uint8_t* src_uv,
                   int src_stride_uv,
                   uint8_t* dst_uv,
                   int dst_stride_uv,
                   int width,
                   int height) {
  int y;
  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
      MirrorUVRow_C;
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_uv = src_uv + (height - 1) * src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }
#if defined(HAS_MIRRORUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorUVRow = MirrorUVRow_Any_NEON;
    if (IS_ALIGNED(width, 32)) {
      MirrorUVRow = MirrorUVRow_NEON;
    }
  }
#endif
#if defined(HAS_MIRRORUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    MirrorUVRow = MirrorUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 8)) {
      MirrorUVRow = MirrorUVRow_SSSE3;
    }
  }
#endif
#if defined(HAS_MIRRORUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    MirrorUVRow = MirrorUVRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
      MirrorUVRow = MirrorUVRow_AVX2;
    }
  }
#endif

  // MirrorUV plane
  for (y = 0; y < height; ++y) {
    MirrorUVRow(src_uv, dst_uv, width);
    src_uv += src_stride_uv;
    dst_uv += dst_stride_uv;
  }
}

// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1089,7 +1139,7 @@ int I420Mirror(const uint8_t* src_y,
               int height) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
  if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
      height == 0) {
    return -1;
  }
@@ -1113,6 +1163,42 @@ int I420Mirror(const uint8_t* src_y,
  return 0;
}

// NV12 mirror.
LIBYUV_API
int NV12Mirror(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_uv,
               int src_stride_uv,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_uv || !dst_uv || width <= 0 ||
      height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  if (dst_y) {
    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  }
  MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
                halfheight);
  return 0;
}

// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1136,7 +1222,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
    if (IS_ALIGNED(width, 8)) {
      ARGBMirrorRow = ARGBMirrorRow_NEON;
    }
  }
@@ -4136,7 +4222,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
    HalfMergeUVRow = HalfMergeUVRow_SSSE3;
  }
#endif

#if defined(HAS_HALFMERGEUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
    HalfMergeUVRow = HalfMergeUVRow_AVX2;
  }
#endif
  for (y = 0; y < height - 1; y += 2) {
    // Merge a row of U and V into a row of UV.
    HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
@@ -347,7 +347,7 @@ void RotateUV180(const uint8_t* src,
  void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
                           int width) = MirrorSplitUVRow_C;
#if defined(HAS_MIRRORSPLITUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
    MirrorSplitUVRow = MirrorSplitUVRow_NEON;
  }
#endif

@@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
    if (IS_ALIGNED(width, 8)) {
      ARGBMirrorRow = ARGBMirrorRow_NEON;
    }
  }
@@ -1182,6 +1182,15 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#ifdef HAS_MIRRORROW_MMI
ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
#endif
#ifdef HAS_MIRRORUVROW_AVX2
ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
#endif
#ifdef HAS_MIRRORUVROW_SSSE3
ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
#endif
#ifdef HAS_MIRRORUVROW_NEON
ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
#endif
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -1189,7 +1198,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15)
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
@@ -2162,6 +2162,17 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  }
}

void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int x;
  src_uv += (width - 1) << 1;
  for (x = 0; x < width; ++x) {
    dst_uv[0] = src_uv[0];
    dst_uv[1] = src_uv[1];
    src_uv -= 2;
    dst_uv += 2;
  }
}

void MirrorSplitUVRow_C(const uint8_t* src_uv,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
@@ -3229,9 +3229,61 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};

void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(
      "movdqa %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu -0x10(%0,%2,2),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_uv),           // %1
        "+r"(temp_width)        // %2
      : "m"(kShuffleMirrorUV)   // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_SSSE3

#ifdef HAS_MIRRORUVROW_AVX2
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(
      "vbroadcastf128 %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_uv),           // %1
        "+r"(temp_width)        // %2
      : "m"(kShuffleMirrorUV)   // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_AVX2

#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                            uint8_t* dst_u,
@@ -3257,7 +3309,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(temp_width)              // %3
      : "m"(kShuffleMirrorUV)         // %4
      : "m"(kShuffleMirrorSplitUV)    // %4
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_MIRRORSPLITUVROW_SSSE3
@@ -7052,6 +7104,54 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                         int src_stride_u,
                         const uint8_t* src_v,
                         int src_stride_v,
                         uint8_t* dst_uv,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"  // load 32 U values
      "vmovdqu (%1),%%ymm1 \n"  // load 32 V values
      "vmovdqu 0(%0,%4,1),%%ymm2 \n"  // 32 from next row
      "vmovdqu 0(%1,%5,1),%%ymm3 \n"
      "lea 0x20(%0),%0 \n"
      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // half size
      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "lea 0x20(%1),%1 \n"
      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%2) \n"  // store 16 UV pixels
      "lea 0x20(%2),%2 \n"
      "sub $0x20,%3 \n"  // 32 src pixels per loop
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      : "r"((intptr_t)(src_stride_u)),  // %4
        "r"((intptr_t)(src_stride_v))   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
@@ -701,6 +701,26 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
      : "cc", "memory", "q0", "q1", "q2");
}

void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile(
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %2, lsl #1 \n"
      "sub %0, #16 \n"

      "1: \n"
      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
      "subs %2, #8 \n"  // 8 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst2.8 {d0, d1}, [%1]! \n"  // dst += 16
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_uv),  // %1
        "+r"(width)    // %2
      :
      : "cc", "memory", "r12", "q0");
}

void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
|
||||
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
// Start at end of source row.
|
||||
"ld1 {v3.16b}, [%4] \n" // shuffler
|
||||
"ld1 {v3.16b}, [%3] \n" // shuffler
|
||||
"add %0, %0, %w2, sxtw \n"
|
||||
"sub %0, %0, #32 \n"
|
||||
"1: \n"
|
||||
"ld1 {v1.16b,v2.16b}, [%0], %3 \n" // src -= 32
|
||||
"ldr q2, [%0, 16] \n"
|
||||
"ldr q1, [%0], -32 \n" // src -= 32
|
||||
"subs %w2, %w2, #32 \n" // 32 pixels per loop.
|
||||
"tbl v1.16b, {v1.16b}, v3.16b \n"
|
||||
"tbl v0.16b, {v2.16b}, v3.16b \n"
|
||||
"tbl v1.16b, {v1.16b}, v3.16b \n"
|
||||
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((ptrdiff_t)-32), // %3
|
||||
"r"(&kShuffleMirror) // %4
|
||||
: "r"(&kShuffleMirror) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
// Shuffle table for reversing the UV.
|
||||
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
|
||||
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
|
||||
|
||||
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
|
||||
asm volatile(
|
||||
// Start at end of source row.
|
||||
"ld1 {v4.16b}, [%3] \n" // shuffler
|
||||
"add %0, %0, %w2, sxtw #1 \n"
|
||||
"sub %0, %0, #32 \n"
|
||||
"1: \n"
|
||||
"ldr q1, [%0, 16] \n"
|
||||
"ldr q0, [%0], -32 \n" // src -= 32
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
|
||||
"tbl v2.16b, {v1.16b}, v4.16b \n"
|
||||
"tbl v3.16b, {v0.16b}, v4.16b \n"
|
||||
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_uv), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(&kShuffleMirrorUV) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||
}
|
||||
|
||||
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Start at end of source row.
|
||||
"ld1 {v4.16b}, [%4] \n" // shuffler
|
||||
"add %0, %0, %w3, sxtw #1 \n"
|
||||
"sub %0, %0, #16 \n"
|
||||
"sub %0, %0, #32 \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
|
||||
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
|
||||
"rev64 v0.8b, v0.8b \n"
|
||||
"rev64 v1.8b, v1.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
|
||||
"st1 {v1.8b}, [%2], #8 \n"
|
||||
"ldr q1, [%0, 16] \n"
|
||||
"ldr q0, [%0], -32 \n" // src -= 32
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels per loop.
|
||||
"tbl v2.16b, {v1.16b}, v4.16b \n"
|
||||
"tbl v3.16b, {v0.16b}, v4.16b \n"
|
||||
"uzp1 v0.16b, v2.16b, v3.16b \n" // U
|
||||
"uzp2 v1.16b, v2.16b, v3.16b \n" // V
|
||||
"st1 {v0.16b}, [%1], #16 \n" // dst += 16
|
||||
"st1 {v1.16b}, [%2], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"((ptrdiff_t)-16) // %4
|
||||
: "cc", "memory", "v0", "v1");
|
||||
: "r"(&kShuffleMirrorUV) // %4
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||
}
|
||||
|
||||
// Shuffle table for reversing the ARGB.
|
||||
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
|
||||
4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
|
||||
|
||||
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||
asm volatile(
|
||||
"ld1 {v4.16b}, [%4] \n" // shuffler
|
||||
"add %0, %0, %w2, sxtw #2 \n" // Start at end of row.
|
||||
"sub %0, %0, #64 \n"
|
||||
// Start at end of source row.
|
||||
"ld1 {v4.16b}, [%3] \n" // shuffler
|
||||
"add %0, %0, %w2, sxtw #2 \n"
|
||||
"sub %0, %0, #32 \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3\n" // src -= 64
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
|
||||
"tbl v0.16b, {v0.16b}, v4.16b \n"
|
||||
"tbl v1.16b, {v1.16b}, v4.16b \n"
|
||||
"tbl v2.16b, {v2.16b}, v4.16b \n"
|
||||
"tbl v3.16b, {v3.16b}, v4.16b \n"
|
||||
"st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n" // dst += 64
|
||||
"ldr q1, [%0, 16] \n"
|
||||
"ldr q0, [%0], -32 \n" // src -= 32
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop.
|
||||
"tbl v2.16b, {v1.16b}, v4.16b \n"
|
||||
"tbl v3.16b, {v0.16b}, v4.16b \n"
|
||||
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"((ptrdiff_t)-64), // %3
|
||||
"r"(&kShuffleMirror) // %4
|
||||
: "r"(&kShuffleMirrorARGB) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||
}
|
||||
|
||||
@ -3249,20 +3281,27 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
// Shuffle table for swapping UV bytes.
|
||||
static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
|
||||
9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
|
||||
|
||||
// Convert UV plane of NV12 to VU of NV21.
|
||||
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
asm volatile(
|
||||
"ld1 {v2.16b}, [%3] \n" // shuffler
|
||||
"1: \n"
|
||||
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
|
||||
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
|
||||
"ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
|
||||
"ld1 {v1.16b}, [%0], 16 \n"
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop
|
||||
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
|
||||
"tbl v0.16b, {v0.16b}, v2.16b \n"
|
||||
"tbl v1.16b, {v1.16b}, v2.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"stp q0, q1, [%1], 32 \n" // store 16 VU pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_vu), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "r"(&kShuffleSwapUV) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2");
|
||||
}
|
||||
|
||||
|
||||
@@ -497,6 +497,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
                 SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)

TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)

#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,         \
                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
@@ -782,44 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
  }
}

TEST_F(LibYUVPlanarTest, TestARGBMirror) {
  SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
  SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
  align_buffer_page_end(dst_pixels_opt,
                        benchmark_width_ * benchmark_height_ * 4);
  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);

  for (int i = 0; i < 1280; ++i) {
    orig_pixels[i][0] = i;
    orig_pixels[i][1] = i / 2;
    orig_pixels[i][2] = i / 3;
    orig_pixels[i][3] = i / 4;
  }
  ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
  MaskCpuFlags(disable_cpu_flags_);
  ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
             benchmark_width_ * 4, benchmark_width_, benchmark_height_);
  MaskCpuFlags(benchmark_cpu_info_);

  for (int i = 0; i < 1280; ++i) {
    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
    EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
    EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
    EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
               benchmark_width_ * 4, benchmark_width_, benchmark_height_);
  }
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
    ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
  free_aligned_buffer_page_end(src_pixels);
  free_aligned_buffer_page_end(dst_pixels_opt);
  free_aligned_buffer_page_end(dst_pixels_c);
}

TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
  SIMD_ALIGNED(uint8_t orig_pixels[1280]);
  SIMD_ALIGNED(uint8_t dst_pixels[1280]);
TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
  align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);

  for (int i = 0; i < 1280; ++i) {
    orig_pixels[i] = i;
  }
  MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
  MaskCpuFlags(disable_cpu_flags_);
  MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
              benchmark_width_, benchmark_height_);
  MaskCpuFlags(benchmark_cpu_info_);

  for (int i = 0; i < 1280; ++i) {
    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
                benchmark_width_, benchmark_height_);
  }
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
    MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
  free_aligned_buffer_page_end(src_pixels);
  free_aligned_buffer_page_end(dst_pixels_opt);
  free_aligned_buffer_page_end(dst_pixels_c);
}

TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
  align_buffer_page_end(dst_pixels_opt,
                        benchmark_width_ * benchmark_height_ * 2);
  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);

  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
  MaskCpuFlags(disable_cpu_flags_);
  MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
                benchmark_width_ * 2, benchmark_width_, benchmark_height_);
  MaskCpuFlags(benchmark_cpu_info_);

  for (int i = 0; i < benchmark_iterations_; ++i) {
    MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
                  benchmark_width_ * 2, benchmark_width_, benchmark_height_);
  }
  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
  }
  free_aligned_buffer_page_end(src_pixels);
  free_aligned_buffer_page_end(dst_pixels_opt);
  free_aligned_buffer_page_end(dst_pixels_c);
}

TEST_F(LibYUVPlanarTest, TestShade) {