Upstream all libyuv changes to version 1746

Prefetch for all ARM functions - helps performance at higher resolutions.
Make MirrorPlane function public.

Bug: libyuv:855
Change-Id: I4020face6b52767ee78d81870314285d63e98b95
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2113650
Reviewed-by: Hsiu Wang <hsiu@google.com>
Authored by Frank Barchard on 2020-03-20 15:22:53 -07:00; committed by Frank Barchard
parent 45f1f2b201
commit b5e223ac4c
14 changed files with 1816 additions and 1493 deletions
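
The prefetch pattern repeated throughout the AArch64 kernels below is "prfm pldl1keep, [reg, 448]": 448 bytes is 7 cache lines of 64 bytes, which is why the diffs all carry a "prefetch 7 lines ahead" comment. A minimal sketch of the loop shape, assuming a plain copy loop rather than any specific libyuv kernel:

#include <stdint.h>

// Sketch only: load, work, store, then prefetch 448 bytes (7 x 64-byte
// cache lines) past the source pointer. Assumes width is a multiple of 16.
static void CopyWithPrefetch(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1:                                        \n"
      "ld1     {v0.16b}, [%0], #16               \n"  // load 16 bytes
      "subs    %w2, %w2, #16                     \n"  // 16 per loop
      "st1     {v0.16b}, [%1], #16               \n"  // store 16 bytes
      "prfm    pldl1keep, [%0, 448]              \n"  // prefetch 7 lines ahead
      "b.gt    1b                                \n"
      : "+r"(src), "+r"(dst), "+r"(width)
      :
      : "cc", "memory", "v0");
}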

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1744
Version: 1746
License: BSD
License File: LICENSE

View File

@ -313,6 +313,15 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
// Mirror a plane of data.
LIBYUV_API
void MirrorPlane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int width,
int height);
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
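
With MirrorPlane exported, a single plane can be mirrored without going through I400Mirror. A minimal usage sketch, assuming this header is include/libyuv/planar_functions.h:

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Mirror a luma plane left-to-right. A negative height additionally flips
// the image vertically (see the implementation later in this change).
void MirrorLuma(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
                int dst_stride_y, int width, int height) {
  libyuv::MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}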

View File

@ -118,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
int width,
int height);
// Rotations for when U and V are interleaved.
// These functions take one input pointer and
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
void RotateUV90(const uint8_t* src,
int src_stride,
@ -128,10 +132,6 @@ void RotateUV90(const uint8_t* src,
int width,
int height);
// Rotations for when U and V are interleaved.
// These functions take one input pointer and
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8_t* src,
int src_stride,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1744
#define LIBYUV_VERSION 1746
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -33,8 +33,10 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
@ -65,8 +67,10 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"

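For reference, HammingDistance_NEON vectorizes an XOR followed by a population count, accumulated over the buffer. A scalar sketch using the GCC/Clang builtin rather than the library's own C path:

#include <stdint.h>

// Count the bits that differ between two buffers.
static uint32_t HammingDistanceScalar(const uint8_t* src_a,
                                      const uint8_t* src_b, int count) {
  uint32_t diff = 0u;
  for (int i = 0; i < count; ++i) {
    diff += (uint32_t)__builtin_popcount(src_a[i] ^ src_b[i]);
  }
  return diff;
}
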
View File

@ -716,70 +716,6 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
// Mirror a plane of data.
void MirrorPlane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorRow = MirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_AVX2;
}
}
#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
if (IS_ALIGNED(width, 64)) {
MirrorRow = MirrorRow_MSA;
}
}
#endif
#if defined(HAS_MIRRORROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MirrorRow = MirrorRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
MirrorRow = MirrorRow_MMI;
}
}
#endif
// Mirror plane
for (y = 0; y < height; ++y) {
MirrorRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@ -1047,6 +983,68 @@ int YUY2ToY(const uint8_t* src_yuy2,
return 0;
}
// Mirror a plane of data.
// See Also I400Mirror
LIBYUV_API
void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
int dst_stride_y, int width, int height) {
int y;
void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorRow = MirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_AVX2;
}
}
#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
if (IS_ALIGNED(width, 64)) {
MirrorRow = MirrorRow_MSA;
}
}
#endif
#if defined(HAS_MIRRORROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MirrorRow = MirrorRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
MirrorRow = MirrorRow_MMI;
}
}
#endif
// Mirror plane
for (y = 0; y < height; ++y) {
MirrorRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
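
The MirrorRow_C fallback that the dispatch above starts from writes each row byte-reversed; its semantics, paraphrased as a sketch rather than the file's exact body:

// Sketch of the scalar fallback: the dst row is the src row reversed.
static void MirrorRowSketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}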
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,

View File

@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
static void ARGBTranspose(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBTranspose(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
// Check stride is a multiple of 4.
if (src_stride_argb & 3) {
return -1;
}
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@ -70,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
dst_argb += dst_stride_argb;
src_argb += 4;
}
return 0;
}
void ARGBRotate90(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBRotate90(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
}
void ARGBRotate270(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBRotate270(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
}
void ARGBRotate180(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBRotate180(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
return 0;
}
LIBYUV_API
@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate270:
ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate180:
ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
default:
break;
}
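
Because ARGBRotate now forwards ARGBTranspose's result, a 90- or 270-degree rotation fails with -1 when the source stride is not a multiple of 4 (the check added above, and exercised by the new RotatePlane90_TestStride test at the end of this change). A usage sketch, assuming ARGBRotate is declared in include/libyuv/rotate_argb.h:

#include "libyuv/rotate_argb.h"  // assumed location of ARGBRotate

// Returns true on success. ARGBRotate now returns -1 for kRotate90 and
// kRotate270 when src_stride is not a multiple of 4.
bool Rotate90Checked(const uint8_t* src, int src_stride, uint8_t* dst,
                     int dst_stride, int width, int height) {
  return libyuv::ARGBRotate(src, src_stride, dst, dst_stride, width, height,
                            libyuv::kRotate90) == 0;
}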

View File

@ -37,7 +37,7 @@ void TransposeWx8_NEON(const uint8_t* src,
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
@ -48,23 +48,39 @@ void TransposeWx8_NEON(const uint8_t* src,
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"trn1 v17.8b, v0.8b, v1.8b \n"
"add %0, %0, %5 \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 1
"trn1 v19.8b, v2.8b, v3.8b \n"
"add %0, %0, %5 \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 2
"trn1 v21.8b, v4.8b, v5.8b \n"
"add %0, %0, %5 \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 3
"trn1 v23.8b, v6.8b, v7.8b \n"
"add %0, %0, %5 \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 4
"trn1 v1.4h, v17.4h, v19.4h \n"
"add %0, %0, %5 \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 5
"trn1 v0.4h, v16.4h, v18.4h \n"
"add %0, %0, %5 \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 6
"trn1 v5.4h, v21.4h, v23.4h \n"
"add %0, %0, %5 \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 7
"trn1 v4.4h, v20.4h, v22.4h \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
@ -226,6 +242,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
"mov %0, %1 \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"

View File

@ -84,7 +84,7 @@ static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u};
0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@ -1101,8 +1101,11 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"lea 0x40(%0),%0 \n" \
"phaddw %%xmm0,%%xmm6 \n" \
"phaddw %%xmm2,%%xmm1 \n" \
"paddw %%" #round ",%%xmm6 \n" \
"paddw %%" #round ",%%xmm1 \n" \
"prefetcht0 1280(%0) \n" \
"paddw %%" #round \
",%%xmm6 \n" \
"paddw %%" #round \
",%%xmm1 \n" \
"psrlw $0x8,%%xmm6 \n" \
"psrlw $0x8,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm6 \n" \
@ -1111,33 +1114,36 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"sub $0x10,%2 \n" \
"jg 1b \n"
#define RGBTOY_AVX2(round) \
"1: \n" \
"vmovdqu (%0),%%ymm0 \n" \
"vmovdqu 0x20(%0),%%ymm1 \n" \
"vmovdqu 0x40(%0),%%ymm2 \n" \
"vmovdqu 0x60(%0),%%ymm3 \n" \
"vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
"vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
"vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
"vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
"lea 0x80(%0),%0 \n" \
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
"vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
"vmovdqu %%ymm0,(%1) \n" \
"lea 0x20(%1),%1 \n" \
"sub $0x20,%2 \n" \
"jg 1b \n" \
#define RGBTOY_AVX2(round) \
"1: \n" \
"vmovdqu (%0),%%ymm0 \n" \
"vmovdqu 0x20(%0),%%ymm1 \n" \
"vmovdqu 0x40(%0),%%ymm2 \n" \
"vmovdqu 0x60(%0),%%ymm3 \n" \
"vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
"vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
"vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
"vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
"lea 0x80(%0),%0 \n" \
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"prefetcht0 1280(%0) \n" \
"vpaddw %%" #round \
",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
"vpaddw %%" #round \
",%%ymm2,%%ymm2 \n" \
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
"vmovdqu %%ymm0,(%1) \n" \
"lea 0x20(%1),%1 \n" \
"sub $0x20,%2 \n" \
"jg 1b \n" \
"vzeroupper \n"
#ifdef HAS_ARGBTOYROW_SSSE3
@ -1148,15 +1154,15 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOYROW_SSSE3
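
The x86 counterpart of the ARM prefetch is the "prefetcht0 1280(%0)" added to RGBTOY and RGBTOY_AVX2 above: a T0-hint prefetch 1280 bytes (20 x 64-byte cache lines) past the source pointer. Roughly the same thing written with intrinsics (a sketch, not code from this change):

#include <xmmintrin.h>

// T0 hint: fetch into all cache levels, 1280 bytes ahead of the read pointer.
static inline void PrefetchSrc(const uint8_t* p) {
  _mm_prefetch((const char*)(p + 1280), _MM_HINT_T0);
}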
@ -1168,8 +1174,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
RGBTOY(xmm5)
LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1187,8 +1192,7 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
RGBTOY(xmm5)
LABELALIGN RGBTOY(xmm5)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1210,8 +1214,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm7)
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1219,7 +1222,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOYROW_AVX2
@ -1232,8 +1236,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm7)
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1241,7 +1244,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ABGRTOYROW_AVX2
@ -1253,15 +1257,15 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm5)
LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
@ -1273,9 +1277,8 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm5)
"vzeroupper \n"
LABELALIGN RGBTOY_AVX2(
ymm5) "vzeroupper \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1536,7 +1539,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kSub128), // %5
"m"(kSub128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
@ -1606,7 +1609,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6
"m"(kSub128) // %7
"m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVJROW_SSSE3
@ -1675,15 +1678,15 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@ -1755,15 +1758,15 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@ -1772,15 +1775,15 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,

File diff suppressed because it is too large.

View File

@ -278,7 +278,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// RGB565.
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@ -315,7 +316,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// ARGB1555.
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@ -401,6 +403,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
"orr v22.8b, v20.8b, v20.8b \n"
"subs %w2, %w2, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
@ -527,7 +530,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
// RGB565.
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@ -601,6 +605,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U
"st1 {v1.16b}, [%2], #16 \n" // store V
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@ -622,6 +627,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@ -645,6 +651,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
@ -669,6 +676,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
@ -687,6 +695,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop
"stp q0, q1, [%1], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -703,6 +712,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@ -716,6 +726,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@ -739,6 +750,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v0.16b, {v2.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -763,6 +775,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
"rev64 v1.8b, v1.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
"st1 {v1.8b}, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@ -783,6 +796,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"rev64 v0.4s, v0.4s \n"
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
"st1 {v0.D}[0], [%1], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -800,6 +814,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
@ -818,6 +833,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
@ -836,6 +852,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
"orr v2.8b, v4.8b, v4.8b \n" // move g
"orr v1.8b, v5.8b, v5.8b \n" // move r
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
@ -853,6 +870,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
@ -885,6 +903,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
@ -942,6 +961,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
ARGB1555TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
@ -972,7 +992,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
@ -989,8 +1009,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
@ -1023,6 +1043,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@ -1038,6 +1059,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@ -1057,6 +1079,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@ -1077,6 +1100,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@ -1102,6 +1126,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
@ -1129,6 +1154,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
@ -1153,6 +1179,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"st1 {v1.16b}, [%1], #16 \n" // store 4.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -1175,6 +1202,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@ -1198,6 +1226,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@ -1217,6 +1246,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
@ -1238,6 +1268,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
@ -1256,6 +1287,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
ARGBTOARGB1555
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB1555.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
@ -1276,6 +1308,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
ARGBTOARGB4444
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB4444.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
@ -1299,6 +1332,7 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@ -1316,6 +1350,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
// pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@ -1338,6 +1373,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@ -1359,6 +1395,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v0.8h, v3.8b, v6.8b \n" // R
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@ -1399,6 +1436,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
@ -1767,6 +1805,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
@ -1832,6 +1871,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
@ -1897,6 +1937,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
@ -1927,6 +1968,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
@ -1954,6 +1996,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
@ -1980,6 +2023,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
@ -2003,6 +2047,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
@ -2026,6 +2071,7 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
@ -2049,6 +2095,7 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
@ -2072,6 +2119,7 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
@ -2095,6 +2143,7 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
@ -2116,6 +2165,7 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
"umlal v0.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_yj), // %1
@ -2135,8 +2185,10 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"umull v0.8h, v0.8b, v4.8b \n" // B
"umlal v0.8h, v1.8b, v5.8b \n" // G
"umlal v0.8h, v2.8b, v6.8b \n" // R
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines ahead
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_yj), // %1
@ -2174,6 +2226,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v2.8h, #8 \n"
"rshrn2 v0.16b, v3.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"b 99f \n"
@ -2290,6 +2343,7 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2331,6 +2385,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@ -2369,6 +2424,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2395,6 +2451,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"orr v1.8b, v0.8b, v0.8b \n" // G
"orr v2.8b, v0.8b, v0.8b \n" // R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2435,6 +2492,7 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@ -2495,6 +2553,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2525,6 +2584,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@ -2550,6 +2610,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@ -2575,6 +2636,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@ -2604,6 +2666,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2626,6 +2689,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2653,6 +2717,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2689,6 +2754,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@ -2727,6 +2793,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@ -2754,6 +2821,7 @@ void HalfFloat1Row_NEON(const uint16_t* src,
"fcvtn v1.4h, v2.4s \n" // 8 half floats
"fcvtn2 v1.8h, v3.4s \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2779,6 +2847,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v1.8h, v3.4s, #13 \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2803,6 +2872,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"fmul v3.4s, v3.4s, %3.s[0] \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2828,6 +2898,7 @@ float ScaleMaxSamples_NEON(const float* src,
"fmax v5.4s, v5.4s, v1.4s \n" // max
"fmax v6.4s, v6.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max accumulator
@ -2857,6 +2928,7 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
@ -2878,6 +2950,7 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2902,18 +2975,23 @@ void GaussCol_NEON(const uint16_t* src0,
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
"ld1 {v2.8h}, [%4], #16 \n"
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
"ld1 {v2.8h}, [%1], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"prfm pldl1keep, [%1, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"ld1 {v2.8h}, [%2], #16 \n"
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
"prfm pldl1keep, [%2, 448] \n"
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
"ld1 {v2.8h}, [%3], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"prfm pldl1keep, [%3, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@ -2946,6 +3024,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
"add v3.4s, v3.4s, v5.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"subs %w5, %w5, #8 \n" // 8 processed per loop
@ -2982,14 +3061,19 @@ void GaussCol_F32_NEON(const float* src0,
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
"prfm pldl1keep, [%1, 448] \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%4], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"prfm pldl1keep, [%2, 448] \n"
"fadd v0.4s, v0.4s, v4.4s \n" // * 1
"prfm pldl1keep, [%3, 448] \n"
"fadd v1.4s, v1.4s, v5.4s \n"
"prfm pldl1keep, [%4, 448] \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
@ -3024,6 +3108,7 @@ void GaussRow_F32_NEON(const float* src,
"fadd v3.4s, v3.4s, v5.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"fmla v1.4s, v3.4s, v6.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
"fmul v1.4s, v1.4s, v8.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
@ -3052,6 +3137,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
@ -3079,6 +3165,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"uqrshrn v2.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@ -3107,6 +3194,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"uqrshrn v1.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@ -3124,6 +3212,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// pixels
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
@ -3140,6 +3229,7 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1

View File

@ -31,6 +31,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -54,6 +55,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@ -82,6 +84,8 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -102,6 +106,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -131,6 +136,10 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
"addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
"st1 {v0.s}[0], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -156,7 +165,8 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -211,7 +221,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@ -252,7 +264,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@ -286,7 +300,8 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%1], #8 \n"
"st1 {v2.s}[2], [%1], #4 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -400,7 +415,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@ -504,7 +522,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@ -528,7 +548,8 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"uaddw v1.8h, v1.8h, v0.8b \n"
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
"subs %w2, %w2, #16 \n" // 16 processed per loop
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@ -599,7 +620,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@ -647,6 +668,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
"b 99f \n"
@ -658,6 +681,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 25b \n"
"b 99f \n"
@ -668,6 +693,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 50b \n"
"b 99f \n"
@ -679,6 +706,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 75b \n"
"b 99f \n"
@ -687,6 +716,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 100b \n"
"99: \n"
@ -713,6 +743,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #8 \n" // 8 processed per loop
"mov v2.16b, v3.16b \n"
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@ -736,6 +767,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"urhadd v1.16b, v2.16b, v3.16b \n"
"st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -769,6 +801,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -794,6 +828,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -838,6 +873,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
@ -878,6 +915,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@ -949,7 +987,8 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"b.gt 1b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@ -984,6 +1023,8 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
"rshrn v0.4h, v0.4s, #2 \n" // round and pack
"rshrn2 v0.8h, v1.4s, #2 \n"
"st1 {v0.8h}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -1032,6 +1073,8 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uqrshrn v17.4h, v18.4s, #4 \n"
"uqrshrn2 v17.8h, v4.4s, #4 \n"
"st2 {v16.8h-v17.8h}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1

View File

@ -804,6 +804,23 @@ TEST_F(LibYUVPlanarTest, TestARGBMirror) {
}
}
TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
SIMD_ALIGNED(uint8_t orig_pixels[1280]);
SIMD_ALIGNED(uint8_t dst_pixels[1280]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i] = i;
}
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
}
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
}
}
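
The negative-height path inside MirrorPlane (the height < 0 branch shown earlier in this change) turns the horizontal mirror into a full 180-degree rotation of the plane. A sketch reusing the function under test:

#include "libyuv/planar_functions.h"  // assumed location of MirrorPlane

// 180-degree rotation of a plane: mirror each row, and pass a negative
// height so MirrorPlane also inverts the image vertically.
void Rotate180Plane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
                    int dst_stride_y, int width, int height) {
  libyuv::MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width,
                      -height);
}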
TEST_F(LibYUVPlanarTest, TestShade) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t shade_pixels[1280][4]);
@ -3315,8 +3332,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
}
#else
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
1280);
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
@ -3369,36 +3386,24 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_c[0], 1280);
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_F32_NEON(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
&orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
&orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
} else {
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
&orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
&orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
}
#else
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
@ -3455,18 +3460,18 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
MaskCpuFlags(disable_cpu_flags_);
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_c), benchmark_width_,
benchmark_width_, benchmark_height_);
(float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_opt), benchmark_width_,
benchmark_width_, benchmark_height_);
(float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < benchmark_width_ * benchmark_height_ ; ++i) {
EXPECT_NEAR(((float*)(dst_pixels_c)) [i],
((float*)(dst_pixels_opt))[i], 1.f) << i;
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
<< i;
}
free_aligned_buffer_page_end(dst_pixels_c);

View File

@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
align_buffer_page_end(src_argb, argb_plane_size);
align_buffer_page_end(dst_argb, argb_plane_size);
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
benchmark_width_ * 4, benchmark_width_,
benchmark_height_, kRotate0));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
benchmark_width_ * 4 - 1, benchmark_width_ - 1,
benchmark_height_, kRotate0));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
benchmark_width_ * 4, benchmark_width_,
benchmark_height_, kRotate180));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
benchmark_width_ * 4 - 1, benchmark_width_ - 1,
benchmark_height_, kRotate180));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_,
benchmark_height_, kRotate90));
EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_ - 1,
benchmark_height_, kRotate90));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_,
benchmark_height_, kRotate270));
EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_ - 1,
benchmark_height_, kRotate270));
free_aligned_buffer_page_end(dst_argb);
free_aligned_buffer_page_end(src_argb);
}
} // namespace libyuv