mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
Upstream all libyuv changes to version 1746. Prefetch for all ARM functions (helps performance at higher resolutions). Make the MirrorPlane function public.
Bug: libyuv:855
Change-Id: I4020face6b52767ee78d81870314285d63e98b95
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2113650
Reviewed-by: Hsiu Wang <hsiu@google.com>
This commit is contained in:
parent
45f1f2b201
commit
b5e223ac4c
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1744
|
||||
Version: 1746
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -313,6 +313,15 @@ int ARGBMirror(const uint8_t* src_argb,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Mirror a plane of data.
|
||||
LIBYUV_API
|
||||
void MirrorPlane(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Convert NV12 to RGB565.
|
||||
LIBYUV_API
|
||||
int NV12ToRGB565(const uint8_t* src_y,
|
||||
|
||||
@ -118,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Rotations for when U and V are interleaved.
|
||||
// These functions take one input pointer and
|
||||
// split the data into two buffers while
|
||||
// rotating them. Deprecated.
|
||||
LIBYUV_API
|
||||
void RotateUV90(const uint8_t* src,
|
||||
int src_stride,
|
||||
@ -128,10 +132,6 @@ void RotateUV90(const uint8_t* src,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Rotations for when U and V are interleaved.
|
||||
// These functions take one input pointer and
|
||||
// split the data into two buffers while
|
||||
// rotating them. Deprecated.
|
||||
LIBYUV_API
|
||||
void RotateUV180(const uint8_t* src,
|
||||
int src_stride,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1744
|
||||
#define LIBYUV_VERSION 1746
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -33,8 +33,10 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
|
||||
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
|
||||
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
|
||||
"eor v0.16b, v0.16b, v2.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"eor v1.16b, v1.16b, v3.16b \n"
|
||||
"cnt v0.16b, v0.16b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"cnt v1.16b, v1.16b \n"
|
||||
"subs %w2, %w2, #32 \n"
|
||||
"add v0.16b, v0.16b, v1.16b \n"
|
||||
@ -65,8 +67,10 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
|
||||
"subs %w2, %w2, #16 \n"
|
||||
"usubl v2.8h, v0.8b, v1.8b \n"
|
||||
"usubl2 v3.8h, v0.16b, v1.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"smlal v16.4s, v2.4h, v2.4h \n"
|
||||
"smlal v17.4s, v3.4h, v3.4h \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"smlal2 v18.4s, v2.8h, v2.8h \n"
|
||||
"smlal2 v19.4s, v3.8h, v3.8h \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
@ -716,70 +716,6 @@ void MergeRGBPlane(const uint8_t* src_r,
|
||||
}
|
||||
}
|
||||
|
||||
// Mirror a plane of data.
|
||||
void MirrorPlane(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
int width,
|
||||
int height) {
|
||||
int y;
|
||||
void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_stride_y = -src_stride_y;
|
||||
}
|
||||
#if defined(HAS_MIRRORROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MirrorRow = MirrorRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
MirrorRow = MirrorRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
MirrorRow = MirrorRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
MirrorRow = MirrorRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
MirrorRow = MirrorRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
MirrorRow = MirrorRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MirrorRow = MirrorRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
MirrorRow = MirrorRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_MMI)
|
||||
if (TestCpuFlag(kCpuHasMMI)) {
|
||||
MirrorRow = MirrorRow_Any_MMI;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
MirrorRow = MirrorRow_MMI;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Mirror plane
|
||||
for (y = 0; y < height; ++y) {
|
||||
MirrorRow(src_y, dst_y, width);
|
||||
src_y += src_stride_y;
|
||||
dst_y += dst_stride_y;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert YUY2 to I422.
|
||||
LIBYUV_API
|
||||
int YUY2ToI422(const uint8_t* src_yuy2,
|
||||
@ -1047,6 +983,68 @@ int YUY2ToY(const uint8_t* src_yuy2,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Mirror a plane of data.
|
||||
// See Also I400Mirror
|
||||
LIBYUV_API
|
||||
void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
|
||||
int dst_stride_y, int width, int height) {
|
||||
int y;
|
||||
void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_stride_y = -src_stride_y;
|
||||
}
|
||||
#if defined(HAS_MIRRORROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
MirrorRow = MirrorRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
MirrorRow = MirrorRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
MirrorRow = MirrorRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
MirrorRow = MirrorRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
MirrorRow = MirrorRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
MirrorRow = MirrorRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
MirrorRow = MirrorRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
MirrorRow = MirrorRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_MIRRORROW_MMI)
|
||||
if (TestCpuFlag(kCpuHasMMI)) {
|
||||
MirrorRow = MirrorRow_Any_MMI;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
MirrorRow = MirrorRow_MMI;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Mirror plane
|
||||
for (y = 0; y < height; ++y) {
|
||||
MirrorRow(src_y, dst_y, width);
|
||||
src_y += src_stride_y;
|
||||
dst_y += dst_stride_y;
|
||||
}
|
||||
}
|
||||
|
||||
// Mirror I400 with optional flipping
|
||||
LIBYUV_API
|
||||
int I400Mirror(const uint8_t* src_y,
|
||||
|
||||
@ -21,17 +21,21 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static void ARGBTranspose(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
static int ARGBTranspose(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
int i;
|
||||
int src_pixel_step = src_stride_argb >> 2;
|
||||
void (*ScaleARGBRowDownEven)(
|
||||
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
|
||||
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
|
||||
// Check stride is a multiple of 4.
|
||||
if (src_stride_argb & 3) {
|
||||
return -1;
|
||||
}
|
||||
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
|
||||
@ -70,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
|
||||
dst_argb += dst_stride_argb;
|
||||
src_argb += 4;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ARGBRotate90(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
static int ARGBRotate90(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
// Rotate by 90 is a ARGBTranspose with the source read
|
||||
// from bottom to top. So set the source pointer to the end
|
||||
// of the buffer and flip the sign of the source stride.
|
||||
src_argb += src_stride_argb * (height - 1);
|
||||
src_stride_argb = -src_stride_argb;
|
||||
ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
|
||||
height);
|
||||
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
}
|
||||
|
||||
void ARGBRotate270(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
static int ARGBRotate270(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
// Rotate by 270 is a ARGBTranspose with the destination written
|
||||
// from bottom to top. So set the destination pointer to the end
|
||||
// of the buffer and flip the sign of the destination stride.
|
||||
dst_argb += dst_stride_argb * (width - 1);
|
||||
dst_stride_argb = -dst_stride_argb;
|
||||
ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
|
||||
height);
|
||||
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
}
|
||||
|
||||
void ARGBRotate180(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
static int ARGBRotate180(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_stride_argb,
|
||||
int width,
|
||||
int height) {
|
||||
// Swap first and last row and mirror the content. Uses a temporary row.
|
||||
align_buffer_64(row, width * 4);
|
||||
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
|
||||
@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
|
||||
dst_bot -= dst_stride_argb;
|
||||
}
|
||||
free_aligned_buffer_64(row);
|
||||
return 0;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
|
||||
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
case kRotate90:
|
||||
ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
|
||||
height);
|
||||
return 0;
|
||||
return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
case kRotate270:
|
||||
ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
|
||||
height);
|
||||
return 0;
|
||||
return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
case kRotate180:
|
||||
ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
|
||||
height);
|
||||
return 0;
|
||||
return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
|
||||
width, height);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
@ -37,7 +37,7 @@ void TransposeWx8_NEON(const uint8_t* src,
|
||||
"sub %w3, %w3, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
"ld1 {v0.8b}, [%0], %5 \n"
|
||||
@ -48,23 +48,39 @@ void TransposeWx8_NEON(const uint8_t* src,
|
||||
"ld1 {v5.8b}, [%0], %5 \n"
|
||||
"ld1 {v6.8b}, [%0], %5 \n"
|
||||
"ld1 {v7.8b}, [%0] \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
"trn2 v16.8b, v0.8b, v1.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"trn1 v17.8b, v0.8b, v1.8b \n"
|
||||
"add %0, %0, %5 \n"
|
||||
"trn2 v18.8b, v2.8b, v3.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // row 1
|
||||
"trn1 v19.8b, v2.8b, v3.8b \n"
|
||||
"add %0, %0, %5 \n"
|
||||
"trn2 v20.8b, v4.8b, v5.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // row 2
|
||||
"trn1 v21.8b, v4.8b, v5.8b \n"
|
||||
"add %0, %0, %5 \n"
|
||||
"trn2 v22.8b, v6.8b, v7.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // row 3
|
||||
"trn1 v23.8b, v6.8b, v7.8b \n"
|
||||
"add %0, %0, %5 \n"
|
||||
|
||||
"trn2 v3.4h, v17.4h, v19.4h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // row 4
|
||||
"trn1 v1.4h, v17.4h, v19.4h \n"
|
||||
"add %0, %0, %5 \n"
|
||||
"trn2 v2.4h, v16.4h, v18.4h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // row 5
|
||||
"trn1 v0.4h, v16.4h, v18.4h \n"
|
||||
"add %0, %0, %5 \n"
|
||||
"trn2 v7.4h, v21.4h, v23.4h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // row 6
|
||||
"trn1 v5.4h, v21.4h, v23.4h \n"
|
||||
"add %0, %0, %5 \n"
|
||||
"trn2 v6.4h, v20.4h, v22.4h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // row 7
|
||||
"trn1 v4.4h, v20.4h, v22.4h \n"
|
||||
|
||||
"trn2 v21.2s, v1.2s, v5.2s \n"
|
||||
@ -226,6 +242,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
|
||||
"ld1 {v5.16b}, [%0], %5 \n"
|
||||
"ld1 {v6.16b}, [%0], %5 \n"
|
||||
"ld1 {v7.16b}, [%0] \n"
|
||||
"mov %0, %1 \n"
|
||||
|
||||
"trn1 v16.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v17.16b, v0.16b, v1.16b \n"
|
||||
|
||||
@ -84,7 +84,7 @@ static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
|
||||
|
||||
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
|
||||
0x8080u, 0x8080u, 0x8080u, 0x8080u};
|
||||
0x8080u, 0x8080u, 0x8080u, 0x8080u};
|
||||
|
||||
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
|
||||
|
||||
@ -1101,8 +1101,11 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"lea 0x40(%0),%0 \n" \
|
||||
"phaddw %%xmm0,%%xmm6 \n" \
|
||||
"phaddw %%xmm2,%%xmm1 \n" \
|
||||
"paddw %%" #round ",%%xmm6 \n" \
|
||||
"paddw %%" #round ",%%xmm1 \n" \
|
||||
"prefetcht0 1280(%0) \n" \
|
||||
"paddw %%" #round \
|
||||
",%%xmm6 \n" \
|
||||
"paddw %%" #round \
|
||||
",%%xmm1 \n" \
|
||||
"psrlw $0x8,%%xmm6 \n" \
|
||||
"psrlw $0x8,%%xmm1 \n" \
|
||||
"packuswb %%xmm1,%%xmm6 \n" \
|
||||
@ -1111,33 +1114,36 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"sub $0x10,%2 \n" \
|
||||
"jg 1b \n"
|
||||
|
||||
#define RGBTOY_AVX2(round) \
|
||||
"1: \n" \
|
||||
"vmovdqu (%0),%%ymm0 \n" \
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n" \
|
||||
"vmovdqu 0x40(%0),%%ymm2 \n" \
|
||||
"vmovdqu 0x60(%0),%%ymm3 \n" \
|
||||
"vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
|
||||
"vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
|
||||
"vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
|
||||
"vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
|
||||
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
|
||||
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
|
||||
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
|
||||
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
|
||||
"lea 0x80(%0),%0 \n" \
|
||||
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
|
||||
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
|
||||
"vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
|
||||
"vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
|
||||
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
|
||||
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
|
||||
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
|
||||
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
|
||||
"vmovdqu %%ymm0,(%1) \n" \
|
||||
"lea 0x20(%1),%1 \n" \
|
||||
"sub $0x20,%2 \n" \
|
||||
"jg 1b \n" \
|
||||
#define RGBTOY_AVX2(round) \
|
||||
"1: \n" \
|
||||
"vmovdqu (%0),%%ymm0 \n" \
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n" \
|
||||
"vmovdqu 0x40(%0),%%ymm2 \n" \
|
||||
"vmovdqu 0x60(%0),%%ymm3 \n" \
|
||||
"vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
|
||||
"vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
|
||||
"vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
|
||||
"vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
|
||||
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
|
||||
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
|
||||
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
|
||||
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
|
||||
"lea 0x80(%0),%0 \n" \
|
||||
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
|
||||
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
|
||||
"prefetcht0 1280(%0) \n" \
|
||||
"vpaddw %%" #round \
|
||||
",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
|
||||
"vpaddw %%" #round \
|
||||
",%%ymm2,%%ymm2 \n" \
|
||||
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
|
||||
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
|
||||
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
|
||||
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
|
||||
"vmovdqu %%ymm0,(%1) \n" \
|
||||
"lea 0x20(%1),%1 \n" \
|
||||
"sub $0x20,%2 \n" \
|
||||
"jg 1b \n" \
|
||||
"vzeroupper \n"
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
@ -1148,15 +1154,15 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY(xmm7)
|
||||
LABELALIGN RGBTOY(xmm7)
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kARGBToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
@ -1168,8 +1174,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY(xmm5)
|
||||
LABELALIGN RGBTOY(xmm5)
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
@ -1187,8 +1192,7 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY(xmm5)
|
||||
LABELALIGN RGBTOY(xmm5)
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
@ -1210,8 +1214,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"vbroadcastf128 %5,%%ymm7 \n"
|
||||
"vmovdqu %6,%%ymm6 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY_AVX2(ymm7)
|
||||
LABELALIGN RGBTOY_AVX2(ymm7)
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
@ -1219,7 +1222,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16), // %5
|
||||
"m"(kPermdARGBToY_AVX) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // HAS_ARGBTOYROW_AVX2
|
||||
|
||||
@ -1232,8 +1236,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
"vbroadcastf128 %5,%%ymm7 \n"
|
||||
"vmovdqu %6,%%ymm6 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY_AVX2(ymm7)
|
||||
LABELALIGN RGBTOY_AVX2(ymm7)
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
@ -1241,7 +1244,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16), // %5
|
||||
"m"(kPermdARGBToY_AVX) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // HAS_ABGRTOYROW_AVX2
|
||||
|
||||
@ -1253,15 +1257,15 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vmovdqu %5,%%ymm6 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY_AVX2(ymm5)
|
||||
LABELALIGN RGBTOY_AVX2(ymm5)
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kARGBToYJ), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kPermdARGBToY_AVX) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // HAS_ARGBTOYJROW_AVX2
|
||||
|
||||
@ -1273,9 +1277,8 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vmovdqu %5,%%ymm6 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY_AVX2(ymm5)
|
||||
"vzeroupper \n"
|
||||
LABELALIGN RGBTOY_AVX2(
|
||||
ymm5) "vzeroupper \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
@ -1536,7 +1539,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
|
||||
"+r"(dst_v), // %2
|
||||
"+rm"(width) // %3
|
||||
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||
"m"(kSub128), // %5
|
||||
"m"(kSub128), // %5
|
||||
"m"(kARGBToVJ), // %6
|
||||
"m"(kARGBToUJ), // %7
|
||||
"m"(kShufARGBToUV_AVX) // %8
|
||||
@ -1606,7 +1609,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
|
||||
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||
"m"(kARGBToVJ), // %5
|
||||
"m"(kARGBToUJ), // %6
|
||||
"m"(kSub128) // %7
|
||||
"m"(kSub128) // %7
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
|
||||
}
|
||||
#endif // HAS_ARGBTOUVJROW_SSSE3
|
||||
@ -1675,15 +1678,15 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY(xmm7)
|
||||
LABELALIGN RGBTOY(xmm7)
|
||||
: "+r"(src_bgra), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kBGRAToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
|
||||
@ -1755,15 +1758,15 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY(xmm7)
|
||||
LABELALIGN RGBTOY(xmm7)
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kABGRToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
@ -1772,15 +1775,15 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
RGBTOY(xmm7)
|
||||
LABELALIGN RGBTOY(xmm7)
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kRGBAToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
|
||||
|
||||
2665
source/row_mmi.cc
2665
source/row_mmi.cc
File diff suppressed because it is too large
Load Diff
@ -278,7 +278,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
|
||||
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
|
||||
// RGB565.
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
"+r"(src_v), // %2
|
||||
@ -315,7 +316,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
|
||||
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
|
||||
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
|
||||
// ARGB1555.
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
"+r"(src_v), // %2
|
||||
@ -401,6 +403,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
"orr v22.8b, v20.8b, v20.8b \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -527,7 +530,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
|
||||
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
|
||||
// RGB565.
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_uv), // %1
|
||||
"+r"(dst_rgb565), // %2
|
||||
@ -601,6 +605,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store U
|
||||
"st1 {v1.16b}, [%2], #16 \n" // store V
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -622,6 +627,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load V
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
"+r"(src_v), // %1
|
||||
@ -645,6 +651,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store R
|
||||
"st1 {v1.16b}, [%2], #16 \n" // store G
|
||||
"st1 {v2.16b}, [%3], #16 \n" // store B
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_r), // %1
|
||||
@ -669,6 +676,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
|
||||
"ld1 {v2.16b}, [%2], #16 \n" // load B
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop
|
||||
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
"+r"(src_g), // %1
|
||||
@ -687,6 +695,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"ldp q0, q1, [%0], #32 \n"
|
||||
"subs %w2, %w2, #32 \n" // 32 processed per loop
|
||||
"stp q0, q1, [%1], #32 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -703,6 +712,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
|
||||
"1: \n"
|
||||
"subs %w1, %w1, #16 \n" // 16 bytes per loop
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst), // %0
|
||||
"+r"(width) // %1
|
||||
@ -716,6 +726,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
|
||||
"1: \n"
|
||||
"subs %w1, %w1, #4 \n" // 4 ints per loop
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst), // %0
|
||||
"+r"(width) // %1
|
||||
@ -739,6 +750,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"tbl v1.16b, {v1.16b}, v3.16b \n"
|
||||
"tbl v0.16b, {v2.16b}, v3.16b \n"
|
||||
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -763,6 +775,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
|
||||
"rev64 v1.8b, v1.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
|
||||
"st1 {v1.8b}, [%2], #8 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -783,6 +796,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"rev64 v0.4s, v0.4s \n"
|
||||
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
|
||||
"st1 {v0.D}[0], [%1], #8 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -800,6 +814,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
|
||||
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb24), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -818,6 +833,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
"orr v4.8b, v0.8b, v0.8b \n" // move r
|
||||
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -836,6 +852,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
|
||||
"orr v2.8b, v4.8b, v4.8b \n" // move g
|
||||
"orr v1.8b, v5.8b, v5.8b \n" // move r
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_rgba), // %1
|
||||
@ -853,6 +870,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
"orr v4.8b, v0.8b, v0.8b \n" // move r
|
||||
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_rgb24), // %1
|
||||
@ -885,6 +903,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb565), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -942,6 +961,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
|
||||
ARGB1555TOARGB
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb1555), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -972,7 +992,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb4444), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -989,8 +1009,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
|
||||
"1: \n"
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
|
||||
// RGB24.
|
||||
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_rgb24), // %1
|
||||
@ -1023,6 +1043,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1038,6 +1059,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1057,6 +1079,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
|
||||
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -1077,6 +1100,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
|
||||
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -1102,6 +1126,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
|
||||
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
|
||||
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
|
||||
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(src_yuy2b), // %1
|
||||
@ -1129,6 +1154,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
|
||||
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
|
||||
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(src_uyvyb), // %1
|
||||
@ -1153,6 +1179,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 4.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -1175,6 +1202,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
|
||||
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
@ -1198,6 +1226,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
|
||||
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
@ -1217,6 +1246,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTORGB565
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_rgb565), // %1
|
||||
@ -1238,6 +1268,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
|
||||
"uqadd v21.8b, v21.8b, v1.8b \n"
|
||||
"uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_rgb) // %0
|
||||
: "r"(src_argb), // %1
|
||||
@ -1256,6 +1287,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
|
||||
ARGBTOARGB1555
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
|
||||
// ARGB1555.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb1555), // %1
|
||||
@ -1276,6 +1308,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
|
||||
ARGBTOARGB4444
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
|
||||
// ARGB4444.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb4444), // %1
|
||||
@ -1299,6 +1332,7 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1316,6 +1350,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
||||
// pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_a), // %1
|
||||
@ -1338,6 +1373,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1359,6 +1395,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
"umlal v0.8h, v3.8b, v6.8b \n" // R
|
||||
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1399,6 +1436,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_u), // %1
|
||||
@ -1767,6 +1805,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
|
||||
"uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb565), // %0
|
||||
"+r"(src_rgb565_1), // %1
|
||||
@ -1832,6 +1871,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
|
||||
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb1555), // %0
|
||||
"+r"(src_argb1555_1), // %1
|
||||
@ -1897,6 +1937,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb4444), // %0
|
||||
"+r"(src_argb4444_1), // %1
|
||||
@ -1927,6 +1968,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
|
||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v27.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb565), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1954,6 +1996,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
|
||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb1555), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -1980,6 +2023,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v27.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb4444), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -2003,6 +2047,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_bgra), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -2026,6 +2071,7 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -2049,6 +2095,7 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -2072,6 +2119,7 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb24), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -2095,6 +2143,7 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -2116,6 +2165,7 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
"umlal v0.8h, v2.8b, v6.8b \n" // R
|
||||
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb24), // %0
|
||||
"+r"(dst_yj), // %1
|
||||
@ -2135,8 +2185,10 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
||||
"umull v0.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v0.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v0.8h, v2.8b, v6.8b \n" // R
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines ahead
|
||||
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_yj), // %1
|
||||
@ -2174,6 +2226,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
|
||||
"rshrn v0.8b, v2.8h, #8 \n"
|
||||
"rshrn2 v0.16b, v3.8h, #8 \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
"b 99f \n"
|
||||
|
||||
@ -2290,6 +2343,7 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
||||
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
// pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -2331,6 +2385,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
|
||||
"uqxtn v1.8b, v1.8h \n"
|
||||
"uqxtn v2.8b, v2.8h \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(width) // %1
|
||||
@ -2369,6 +2424,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
||||
"uqxtn v6.8b, v6.8h \n"
|
||||
"uqxtn v7.8b, v7.8h \n"
|
||||
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -2395,6 +2451,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||
"orr v1.8b, v0.8b, v0.8b \n" // G
|
||||
"orr v2.8b, v0.8b, v0.8b \n" // R
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -2435,6 +2492,7 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
|
||||
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
|
||||
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(width) // %1
|
||||
@ -2495,6 +2553,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
|
||||
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -2525,6 +2584,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
|
||||
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
|
||||
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
@ -2550,6 +2610,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
|
||||
"uqadd v2.8b, v2.8b, v6.8b \n"
|
||||
"uqadd v3.8b, v3.8b, v7.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
@ -2575,6 +2636,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
|
||||
"uqsub v2.8b, v2.8b, v6.8b \n"
|
||||
"uqsub v3.8b, v3.8b, v7.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(src_argb1), // %1
|
||||
@ -2604,6 +2666,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
|
||||
"orr v1.8b, v0.8b, v0.8b \n"
|
||||
"orr v2.8b, v0.8b, v0.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
@ -2626,6 +2689,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"uqadd v0.16b, v0.16b, v1.16b \n" // add
|
||||
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
@ -2653,6 +2717,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v1.8b, v0.8b, v2.8b \n" // add
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
"+r"(src_sobely), // %1
|
||||
@ -2689,6 +2754,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
||||
"abs v0.8h, v0.8h \n"
|
||||
"uqxtn v0.8b, v0.8h \n"
|
||||
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y0), // %0
|
||||
"+r"(src_y1), // %1
|
||||
@ -2727,6 +2793,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
||||
"abs v0.8h, v0.8h \n"
|
||||
"uqxtn v0.8b, v0.8h \n"
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y0), // %0
|
||||
"+r"(src_y1), // %1
|
||||
@ -2754,6 +2821,7 @@ void HalfFloat1Row_NEON(const uint16_t* src,
|
||||
"fcvtn v1.4h, v2.4s \n" // 8 half floats
|
||||
"fcvtn2 v1.8h, v3.4s \n"
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -2779,6 +2847,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
|
||||
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
|
||||
"uqshrn2 v1.8h, v3.4s, #13 \n"
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -2803,6 +2872,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
|
||||
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
|
||||
"fmul v3.4s, v3.4s, %3.s[0] \n"
|
||||
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -2828,6 +2898,7 @@ float ScaleMaxSamples_NEON(const float* src,
|
||||
"fmax v5.4s, v5.4s, v1.4s \n" // max
|
||||
"fmax v6.4s, v6.4s, v2.4s \n"
|
||||
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
"fmax v5.4s, v5.4s, v6.4s \n" // max
|
||||
"fmaxv %s3, v5.4s \n" // signed max accumulator
|
||||
@ -2857,6 +2928,7 @@ float ScaleSumSamples_NEON(const float* src,
|
||||
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
|
||||
"fmla v6.4s, v2.4s, v2.4s \n"
|
||||
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
"faddp v5.4s, v5.4s, v6.4s \n"
|
||||
"faddp v5.4s, v5.4s, v5.4s \n"
|
||||
@ -2878,6 +2950,7 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
|
||||
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
|
||||
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
|
||||
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -2902,18 +2975,23 @@ void GaussCol_NEON(const uint16_t* src0,
|
||||
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
|
||||
"ld1 {v2.8h}, [%4], #16 \n"
|
||||
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
|
||||
"ld1 {v2.8h}, [%1], #16 \n"
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"ld1 {v2.8h}, [%2], #16 \n"
|
||||
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
|
||||
"ld1 {v2.8h}, [%3], #16 \n"
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
|
||||
"prfm pldl1keep, [%4, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src0), // %0
|
||||
"+r"(src1), // %1
|
||||
@ -2946,6 +3024,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
||||
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
|
||||
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
|
||||
"add v3.4s, v3.4s, v5.4s \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"mla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"mla v1.4s, v3.4s, v6.4s \n" // * 4
|
||||
"subs %w5, %w5, #8 \n" // 8 processed per loop
|
||||
@ -2982,14 +3061,19 @@ void GaussCol_F32_NEON(const float* src0,
|
||||
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
|
||||
"fmla v1.4s, v3.4s, v6.4s \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
|
||||
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
|
||||
"fmla v1.4s, v5.4s, v7.4s \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"ld1 {v4.4s, v5.4s}, [%4], #32 \n"
|
||||
"fmla v1.4s, v3.4s, v6.4s \n"
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"fadd v0.4s, v0.4s, v4.4s \n" // * 1
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"fadd v1.4s, v1.4s, v5.4s \n"
|
||||
"prfm pldl1keep, [%4, 448] \n"
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
|
||||
"b.gt 1b \n"
|
||||
@ -3024,6 +3108,7 @@ void GaussRow_F32_NEON(const float* src,
|
||||
"fadd v3.4s, v3.4s, v5.4s \n"
|
||||
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"fmla v1.4s, v3.4s, v6.4s \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
|
||||
"fmul v1.4s, v1.4s, v8.4s \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
@ -3052,6 +3137,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
||||
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels per loop
|
||||
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_vu), // %1
|
||||
@ -3079,6 +3165,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
"uqrshrn v2.8b, v1.8h, #2 \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
"+r"(src_ayuv_1), // %1
|
||||
@ -3107,6 +3194,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
"uqrshrn v1.8b, v1.8h, #2 \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
"+r"(src_ayuv_1), // %1
|
||||
@ -3124,6 +3212,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
// pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop
|
||||
"st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -3140,6 +3229,7 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop
|
||||
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_vu), // %1
|
||||
|
||||
@ -31,6 +31,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
|
||||
// load even pixels into v0, odd into v1
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
@ -54,6 +55,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
|
||||
"st1 {v0.16b}, [%1], #16 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -82,6 +84,8 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
|
||||
"rshrn2 v0.16b, v1.8h, #2 \n"
|
||||
"st1 {v0.16b}, [%2], #16 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(src_stride), // %1
|
||||
@ -102,6 +106,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"st1 {v2.8b}, [%1], #8 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
@ -131,6 +136,10 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
|
||||
"addp v0.8h, v0.8h, v0.8h \n"
|
||||
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
|
||||
"st1 {v0.s}[0], [%1], #4 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"prfm pldl1keep, [%4, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
@ -156,7 +165,8 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
|
||||
"subs %w2, %w2, #24 \n"
|
||||
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
|
||||
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
@ -211,7 +221,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
|
||||
|
||||
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
|
||||
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width), // %2
|
||||
@ -252,7 +264,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
|
||||
"uqrshrn v2.8b, v4.8h, #2 \n"
|
||||
|
||||
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width), // %2
|
||||
@ -286,7 +300,8 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
|
||||
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
|
||||
"st1 {v2.8b}, [%1], #8 \n"
|
||||
"st1 {v2.s}[2], [%1], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
@ -400,7 +415,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
|
||||
|
||||
"st1 {v3.8b}, [%1], #8 \n"
|
||||
"st1 {v3.s}[2], [%1], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(tmp_src_stride), // %2
|
||||
@ -504,7 +522,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
|
||||
|
||||
"st1 {v3.8b}, [%1], #8 \n"
|
||||
"st1 {v3.s}[2], [%1], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(tmp_src_stride), // %2
|
||||
@ -528,7 +548,8 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
|
||||
"uaddw v1.8h, v1.8h, v0.8b \n"
|
||||
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(src_width) // %2
|
||||
@ -599,7 +620,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
|
||||
"add v1.4s, v1.4s, v0.4s \n"
|
||||
"add v2.4s, v2.4s, v0.4s \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"b.gt 1b \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_width), // %2
|
||||
@ -647,6 +668,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
|
||||
"rshrn v0.8b, v6.8h, #8 \n"
|
||||
"rshrn2 v0.16b, v7.8h, #8 \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
"b 99f \n"
|
||||
|
||||
@ -658,6 +681,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"b.gt 25b \n"
|
||||
"b 99f \n"
|
||||
|
||||
@ -668,6 +693,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"b.gt 50b \n"
|
||||
"b 99f \n"
|
||||
|
||||
@ -679,6 +706,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"b.gt 75b \n"
|
||||
"b 99f \n"
|
||||
|
||||
@ -687,6 +716,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 100b \n"
|
||||
|
||||
"99: \n"
|
||||
@ -713,6 +743,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"mov v2.16b, v3.16b \n"
|
||||
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -736,6 +767,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
|
||||
"urhadd v1.16b, v2.16b, v3.16b \n"
|
||||
"st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -769,6 +801,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
"rshrn v2.8b, v2.8h, #2 \n"
|
||||
"rshrn v3.8b, v3.8h, #2 \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(src_stride), // %1
|
||||
@ -794,6 +828,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
|
||||
"ld1 {v0.s}[3], [%0], %3 \n"
|
||||
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
|
||||
"st1 {v0.16b}, [%1], #16 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
@ -838,6 +873,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
|
||||
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
|
||||
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
|
||||
"st1 {v0.16b}, [%2], #16 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(src_stride), // %1
|
||||
@ -878,6 +915,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
|
||||
// clang-format on
|
||||
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
@ -949,7 +987,8 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
|
||||
"st1 {v0.4s}, [%0], #16 \n" // store pixels
|
||||
"add v5.4s, v5.4s, v6.4s \n"
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width), // %2
|
||||
@ -984,6 +1023,8 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
|
||||
"rshrn v0.4h, v0.4s, #2 \n" // round and pack
|
||||
"rshrn2 v0.8h, v1.4s, #2 \n"
|
||||
"st1 {v0.8h}, [%2], #16 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(src_stride), // %1
|
||||
@ -1032,6 +1073,8 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
|
||||
"uqrshrn v17.4h, v18.4s, #4 \n"
|
||||
"uqrshrn2 v17.8h, v4.4s, #4 \n"
|
||||
"st2 {v16.8h-v17.8h}, [%2], #32 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(src_stride), // %1
|
||||
|
||||
@ -804,6 +804,23 @@ TEST_F(LibYUVPlanarTest, TestARGBMirror) {
|
||||
}
|
||||
}
|
||||
|
||||
// Checks MirrorPlane on a single 1280-byte row: the byte written at
// dst_pixels[1280 - 1 - i] must equal the byte read from orig_pixels[i],
// i.e. the row comes out reversed. The call is then repeated
// benchmark_pixels_div1280_ times without re-checking the output.
TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
|
||||
SIMD_ALIGNED(uint8_t orig_pixels[1280]);
|
||||
SIMD_ALIGNED(uint8_t dst_pixels[1280]);
|
||||
|
||||
for (int i = 0; i < 1280; ++i) {
|
||||
// Stored into a uint8_t, so the value kept is i & 255 (the pattern wraps
// every 256 bytes); the expectation below uses the same mask.
orig_pixels[i] = i;
|
||||
}
|
||||
// Arguments are (src, src_stride, dst, dst_stride, width, height).
// Only one row is processed (height 1) and both strides are passed as 0.
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
|
||||
|
||||
for (int i = 0; i < 1280; ++i) {
|
||||
// Source byte i is expected at the mirrored index 1280 - 1 - i.
EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
|
||||
}
|
||||
// Repeat the same call benchmark_pixels_div1280_ times; nothing is asserted
// here, so this loop only exercises the function repeatedly.
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestShade) {
|
||||
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
|
||||
SIMD_ALIGNED(uint8_t shade_pixels[1280][4]);
|
||||
@ -3315,8 +3332,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
||||
}
|
||||
#else
|
||||
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
|
||||
1280);
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||
&dst_pixels_opt[0], 1280);
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -3369,36 +3386,24 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
|
||||
for (int i = 0; i < 1280 * 5; ++i) {
|
||||
orig_pixels[i] = static_cast<float>(i);
|
||||
}
|
||||
GaussCol_F32_C(&orig_pixels[0],
|
||||
&orig_pixels[1280],
|
||||
&orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3],
|
||||
&orig_pixels[1280 * 4],
|
||||
&dst_pixels_c[0], 1280);
|
||||
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||
&dst_pixels_c[0], 1280);
|
||||
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||
if (has_neon) {
|
||||
GaussCol_F32_NEON(&orig_pixels[0],
|
||||
&orig_pixels[1280],
|
||||
&orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3],
|
||||
&orig_pixels[1280 * 4],
|
||||
&dst_pixels_opt[0], 1280);
|
||||
GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
|
||||
&orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
|
||||
&orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
|
||||
} else {
|
||||
GaussCol_F32_C(&orig_pixels[0],
|
||||
&orig_pixels[1280],
|
||||
&orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3],
|
||||
&orig_pixels[1280 * 4],
|
||||
&dst_pixels_opt[0], 1280);
|
||||
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
|
||||
&orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
|
||||
&orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
|
||||
}
|
||||
#else
|
||||
GaussCol_F32_C(&orig_pixels[0],
|
||||
&orig_pixels[1280],
|
||||
&orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3],
|
||||
&orig_pixels[1280 * 4],
|
||||
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
|
||||
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
|
||||
&dst_pixels_opt[0], 1280);
|
||||
#endif
|
||||
}
|
||||
@ -3455,18 +3460,18 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
|
||||
|
||||
MaskCpuFlags(disable_cpu_flags_);
|
||||
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
|
||||
(float*)(dst_pixels_c), benchmark_width_,
|
||||
benchmark_width_, benchmark_height_);
|
||||
(float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
|
||||
benchmark_height_);
|
||||
MaskCpuFlags(benchmark_cpu_info_);
|
||||
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
|
||||
(float*)(dst_pixels_opt), benchmark_width_,
|
||||
benchmark_width_, benchmark_height_);
|
||||
(float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
|
||||
benchmark_height_);
|
||||
}
|
||||
for (int i = 0; i < benchmark_width_ * benchmark_height_ ; ++i) {
|
||||
EXPECT_NEAR(((float*)(dst_pixels_c)) [i],
|
||||
((float*)(dst_pixels_opt))[i], 1.f) << i;
|
||||
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
|
||||
EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
|
||||
<< i;
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(dst_pixels_c);
|
||||
|
||||
@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
|
||||
benchmark_cpu_info_);
|
||||
}
|
||||
|
||||
// Checks the return code of ARGBRotate for every rotation mode, once with a
// source stride of benchmark_width_ * 4 (a multiple of 4) and once with
// benchmark_width_ * 4 - 1 (not a multiple of 4) and a width of
// benchmark_width_ - 1.
// Expected results encoded below: kRotate0 and kRotate180 return 0 for both
// strides; kRotate90 and kRotate270 return 0 for the first stride and -1 for
// the second.
// NOTE(review): despite the "RotatePlane90" name, only ARGBRotate is called.
// NOTE(review): this test never writes to src_argb itself and never inspects
// dst_argb; only return codes are asserted.
TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
|
||||
// Bytes in one ARGB image: 4 bytes per pixel, using |height| so the size
// stays positive if benchmark_height_ is negative.
int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
|
||||
|
||||
align_buffer_page_end(src_argb, argb_plane_size);
|
||||
align_buffer_page_end(dst_argb, argb_plane_size);
|
||||
|
||||
// kRotate0, strides of width * 4: expected to succeed.
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
|
||||
benchmark_width_ * 4, benchmark_width_,
|
||||
benchmark_height_, kRotate0));
|
||||
|
||||
// kRotate0, strides of width * 4 - 1 and width - 1: still expected to succeed.
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
|
||||
benchmark_width_ * 4 - 1, benchmark_width_ - 1,
|
||||
benchmark_height_, kRotate0));
|
||||
|
||||
// kRotate180, strides of width * 4: expected to succeed.
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
|
||||
benchmark_width_ * 4, benchmark_width_,
|
||||
benchmark_height_, kRotate180));
|
||||
|
||||
// kRotate180, strides of width * 4 - 1 and width - 1: still expected to
// succeed.
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
|
||||
benchmark_width_ * 4 - 1, benchmark_width_ - 1,
|
||||
benchmark_height_, kRotate180));
|
||||
|
||||
// kRotate90: the destination stride is |height| * 4 instead of width * 4.
// Source stride of width * 4: expected to succeed.
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
|
||||
abs(benchmark_height_) * 4, benchmark_width_,
|
||||
benchmark_height_, kRotate90));
|
||||
|
||||
// kRotate90 with a source stride of width * 4 - 1: expected to be rejected
// with -1.
// NOTE(review): presumably because the stride is not a whole number of
// 4-byte pixels; confirm against the ARGBRotate implementation.
EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
|
||||
abs(benchmark_height_) * 4, benchmark_width_ - 1,
|
||||
benchmark_height_, kRotate90));
|
||||
|
||||
// kRotate270, source stride of width * 4: expected to succeed.
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
|
||||
abs(benchmark_height_) * 4, benchmark_width_,
|
||||
benchmark_height_, kRotate270));
|
||||
|
||||
// kRotate270 with a source stride of width * 4 - 1: expected to be rejected
// with -1, mirroring the kRotate90 case.
EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
|
||||
abs(benchmark_height_) * 4, benchmark_width_ - 1,
|
||||
benchmark_height_, kRotate270));
|
||||
|
||||
// Release both buffers obtained from align_buffer_page_end above.
free_aligned_buffer_page_end(dst_argb);
|
||||
free_aligned_buffer_page_end(src_argb);
|
||||
}
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user