J400ToARGB optimized for Exynos using ZIP+ST1

Bug: 204562143
Change-Id: I56c98198c02bd0dd1283f1c14837730c92832c39
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3328702
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Author: Frank Barchard <fbarchard@chromium.org>
Date: 2021-12-09 13:44:17 -08:00, committed by libyuv LUCI CQ
parent c5d48a11f9
commit d7a2d5da87
6 changed files with 299 additions and 153 deletions
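For context, the row function named in the subject reduces to this scalar loop (an editor's sketch, not libyuv's C fallback verbatim): J400 is full-range grayscale, so each Y byte is replicated into B, G and R with alpha forced to 255. The NEON changes below vectorize exactly this.

#include <stdint.h>

// Sketch of J400ToARGB per row; ARGB is B,G,R,A in little-endian memory.
void J400ToARGBRow_sketch(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint8_t y = src_y[x];
    dst_argb[4 * x + 0] = y;    // B
    dst_argb[4 * x + 1] = y;    // G
    dst_argb[4 * x + 2] = y;    // R
    dst_argb[4 * x + 3] = 255;  // A
  }
}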

README.chromium

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1805
Version: 1806
License: BSD
License File: LICENSE

include/libyuv/row.h

@@ -320,7 +320,6 @@ extern "C" {
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1
// https://code.google.com/p/libyuv/issues/detail?id=517

include/libyuv/version.h

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1805
#define LIBYUV_VERSION 1806
#endif // INCLUDE_LIBYUV_VERSION_H_

source/row_any.cc

@@ -545,21 +545,21 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
#undef ANY31PT
// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
int width) { \
SIMD_ALIGNED(uint8_t temp[128 * 3]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
int width) { \
SIMD_ALIGNED(uint8_t temp[128 * 3]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
} \
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
SS(r, UVSHIFT) * SBPP2); \
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
}
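The wrapper runs the SIMD kernel over the largest multiple-of-(MASK+1) prefix, then copies the remainder into a zeroed scratch buffer and runs one final SIMD iteration there, so kernels never need a scalar tail. A typical instantiation looks like this (parameter values illustrative of the MergeUV case: byte-sized sources, 2 bytes per output pixel, 16 pixels per NEON call):

ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)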
// Merge functions.

source/row_gcc.cc

@@ -8894,10 +8894,10 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
static const uvec8 kYUV24Shuffle[3] =
{{ 8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12 },
{ 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15 },
{ 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7 }};
static const uvec8 kYUV24Shuffle[3] = {
{8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
{9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
{2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};
// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
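As a reference for the shuffles that follow, the conversion in scalar form (editor's sketch; the V,U,Y output byte order is inferred from the shuffle tables):

#include <stdint.h>

// Each output pixel is 3 bytes; one VU pair is shared by two Y samples.
void NV21ToYUV24Row_sketch(const uint8_t* src_y, const uint8_t* src_vu,
                           uint8_t* dst_yuv24, int width) {
  for (int x = 0; x < width; ++x) {
    dst_yuv24[3 * x + 0] = src_vu[(x & ~1) + 0];  // V
    dst_yuv24[3 * x + 1] = src_vu[(x & ~1) + 1];  // U
    dst_yuv24[3 * x + 2] = src_y[x];              // Y
  }
}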
@@ -8929,10 +8929,10 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
"lea 48(%2),%2 \n"
"sub $16,%3 \n" // 16 pixels per loop
"jg 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
: "r"(&kYUV24Shuffle[0]) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
@@ -8945,79 +8945,78 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
uint8_t* dst_yuv24,
int width) {
asm volatile(
"sub %0,%1 \n"
"vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
"vbroadcastf128 16(%4),%%ymm5 \n"
"vbroadcastf128 32(%4),%%ymm6 \n"
"sub %0,%1 \n"
"vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
"vbroadcastf128 16(%4),%%ymm5 \n"
"vbroadcastf128 32(%4),%%ymm6 \n"
"1: \n"
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
"lea 32(%0),%0 \n"
"vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
"vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
"vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
"vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
"vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm3,(%2) \n"
"vmovdqu %%ymm0,32(%2) \n"
"vmovdqu %%ymm1,64(%2) \n"
"lea 96(%2),%2 \n"
"sub $32,%3 \n" // 32 pixels per loop
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
"1: \n"
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
"lea 32(%0),%0 \n"
"vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
"vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
"vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
"vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
"vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
"vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm3,(%2) \n"
"vmovdqu %%ymm0,32(%2) \n"
"vmovdqu %%ymm1,64(%2) \n"
"lea 96(%2),%2 \n"
"sub $32,%3 \n" // 32 pixels per loop
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
: "r"(&kYUV24Shuffle[0]) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#ifdef HAS_NV21ToYUV24ROW_AVX512
// The following VBMI VEX256 code tests okay with the Intel SDE emulator.
static const lvec8 kYUV24Perm[3] =
{{ 32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43 },
{ 10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52 },
{ 53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31 }};
static const lvec8 kYUV24Perm[3] = {
{32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43},
{10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
{53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};
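A scalar model of the 256-bit vpermt2b these tables feed (editor's sketch): indices 0-31 select from the register holding Y, indices 32-63 from the register holding VU, so a single instruction gathers V, U, Y triples.

#include <stdint.h>

// dst starts as a copy of the Y register; idx is one of the kYUV24Perm rows.
static void vpermt2b_model(const uint8_t idx[32], const uint8_t y_reg[32],
                           const uint8_t vu_reg[32], uint8_t dst[32]) {
  for (int i = 0; i < 32; ++i) {
    uint8_t k = idx[i] & 63;
    dst[i] = (k < 32) ? y_reg[k] : vu_reg[k - 32];
  }
}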
void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
asm volatile(
"sub %0,%1 \n"
"vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
"vmovdqa 32(%4),%%ymm5 \n"
"vmovdqa 64(%4),%%ymm6 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
"lea 32(%0),%0 \n"
"vmovdqa %%ymm2, %%ymm0 \n"
"vmovdqa %%ymm2, %%ymm1 \n"
"vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
"vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
"vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
"vmovdqu %%ymm0,(%2) \n"
"vmovdqu %%ymm1,32(%2) \n"
"vmovdqu %%ymm2,64(%2) \n"
"lea 96(%2),%2 \n"
"sub $32,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
"sub %0,%1 \n"
"vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
"vmovdqa 32(%4),%%ymm5 \n"
"vmovdqa 64(%4),%%ymm6 \n" LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
"lea 32(%0),%0 \n"
"vmovdqa %%ymm2, %%ymm0 \n"
"vmovdqa %%ymm2, %%ymm1 \n"
"vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
"vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
"vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
"vmovdqu %%ymm0,(%2) \n"
"vmovdqu %%ymm1,32(%2) \n"
"vmovdqu %%ymm2,64(%2) \n"
"lea 96(%2),%2 \n"
"sub $32,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
: "r"(&kYUV24Perm[0]) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

source/row_neon64.cc

@@ -15,7 +15,8 @@ namespace libyuv {
extern "C" {
#endif
// Enable LIBYUV_USE_ST2 and LIBYUV_USE_ST3 for CPUs that prefer them.
// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
// STn over ZIP1+ST1
// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
// This module is for GCC Neon armv8 64 bit.
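The two strategies, in ACLE intrinsic terms (editor's sketch; both interleave two 16-byte planes into 32 bytes of U/V pairs):

#include <arm_neon.h>
#include <stdint.h>

// st2 performs the interleave inside the store instruction.
void merge_uv16_st2(const uint8_t* u, const uint8_t* v, uint8_t* dst_uv) {
  uint8x16x2_t uv = {{vld1q_u8(u), vld1q_u8(v)}};
  vst2q_u8(dst_uv, uv);
}

// zip1/zip2 interleave in registers, then plain st1 stores the result;
// preferable on cores such as Exynos M1-M3 where STn is slow.
void merge_uv16_zip(const uint8_t* u, const uint8_t* v, uint8_t* dst_uv) {
  uint8x16_t u16 = vld1q_u8(u);
  uint8x16_t v16 = vld1q_u8(v);
  vst1q_u8(dst_uv, vzip1q_u8(u16, v16));       // U0 V0 .. U7 V7
  vst1q_u8(dst_uv + 16, vzip2q_u8(u16, v16));  // U8 V8 .. U15 V15
}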
@@ -385,6 +386,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v19");
}
#if LIBYUV_USE_ST4
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
"movi v23.8b, #255 \n"
@@ -402,6 +404,27 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
:
: "cc", "memory", "v20", "v21", "v22", "v23");
}
#else
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile(
"movi v20.8b, #255 \n"
"1: \n"
"ldr d16, [%0], #8 \n"
"subs %w2, %w2, #8 \n"
"zip1 v18.16b, v16.16b, v16.16b \n" // YY
"zip1 v19.16b, v16.16b, v20.16b \n" // YA
"prfm pldl1keep, [%0, 448] \n"
"zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
"zip2 v17.16b, v18.16b, v19.16b \n"
"stp q16, q17, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "cc", "memory", "v16", "v17", "v18", "v19", "v20");
}
#endif // LIBYUV_USE_ST4
void NV12ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_uv,
@@ -581,6 +604,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
);
}
#if LIBYUV_USE_ST2
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
@@ -604,6 +628,86 @@ void MergeUVRow_NEON(const uint8_t* src_u,
);
}
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
asm volatile(
"dup v2.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"ushl v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"ushl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "v0", "v1", "v2");
}
#else
// Reads 16 U's and V's and writes out 16 pairs of UV.
void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
asm volatile(
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load U
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
"zip1 v2.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"zip2 v3.16b, v0.16b, v1.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
asm volatile(
"dup v4.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"ushl v0.8h, v0.8h, v4.8h \n"
"ushl v1.8h, v1.8h, v4.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"zip1 v2.8h, v0.8h, v1.8h \n"
"zip2 v3.8h, v0.8h, v1.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
}
#endif // LIBYUV_USE_ST2
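The 16-bit variants additionally left-align the samples; a scalar model (editor's sketch): with shift = 16 - depth, e.g. 10-bit data is shifted into the top of each 16-bit lane before the interleave.

#include <stdint.h>

void MergeUVRow_16_sketch(const uint16_t* src_u, const uint16_t* src_v,
                          uint16_t* dst_uv, int depth, int width) {
  int shift = 16 - depth;  // 10-bit depth -> shift 6, MSB-aligned output
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = (uint16_t)(src_u[x] << shift);
    dst_uv[2 * x + 1] = (uint16_t)(src_v[x] << shift);
  }
}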
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_r,
@@ -684,6 +788,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba,
);
}
#if LIBYUV_USE_ST4
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
@@ -693,9 +798,9 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
int width) {
asm volatile(
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v0.16b}, [%2], #16 \n" // load B
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v3.16b}, [%3], #16 \n" // load A
"subs %w5, %w5, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n"
@@ -714,6 +819,47 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#else
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile(
"1: \n"
"ld1 {v0.16b}, [%2], #16 \n" // load B
"ld1 {v1.16b}, [%1], #16 \n" // load G
"ld1 {v2.16b}, [%0], #16 \n" // load R
"ld1 {v3.16b}, [%3], #16 \n" // load A
"subs %w5, %w5, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%2, 448] \n"
"zip1 v4.16b, v0.16b, v1.16b \n" // BG
"zip1 v5.16b, v2.16b, v3.16b \n" // RA
"prfm pldl1keep, [%1, 448] \n"
"zip2 v6.16b, v0.16b, v1.16b \n" // BG
"zip2 v7.16b, v2.16b, v3.16b \n" // RA
"prfm pldl1keep, [%0, 448] \n"
"zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
"zip2 v1.8h, v4.8h, v5.8h \n"
"prfm pldl1keep, [%3, 448] \n"
"zip1 v2.8h, v6.8h, v7.8h \n"
"zip2 v3.8h, v6.8h, v7.8h \n"
"st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
"+r"(src_a), // %3
"+r"(dst_argb), // %4
"+r"(width) // %5
: // Input registers
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
);
}
#endif // LIBYUV_USE_ST4
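The four-plane merge zips at two granularities; in intrinsic terms (editor's sketch covering the first 8 of the 16 pixels): byte zips build B,G and R,A pairs, then 16-bit zips interleave the pairs into B,G,R,A byte order, i.e. ARGB in little-endian memory.

#include <arm_neon.h>
#include <stdint.h>

void merge_argb8_zip(const uint8_t* b, const uint8_t* g, const uint8_t* r,
                     const uint8_t* a, uint8_t* dst_argb) {
  uint16x8_t bg = vreinterpretq_u16_u8(vzip1q_u8(vld1q_u8(b), vld1q_u8(g)));
  uint16x8_t ra = vreinterpretq_u16_u8(vzip1q_u8(vld1q_u8(r), vld1q_u8(a)));
  vst1q_u8(dst_argb, vreinterpretq_u8_u16(vzip1q_u16(bg, ra)));       // px 0-3
  vst1q_u8(dst_argb + 16, vreinterpretq_u8_u16(vzip2q_u16(bg, ra)));  // px 4-7
}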
// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
@@ -1706,28 +1852,6 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
:
: "cc", "memory", "v0", "v1", "v2", "v3");
}
#else
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ar64,
int width) {
asm volatile(
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"zip1 v2.16b, v0.16b, v0.16b \n"
"zip2 v3.16b, v0.16b, v0.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"zip1 v4.16b, v1.16b, v1.16b \n"
"zip2 v5.16b, v1.16b, v1.16b \n"
"st1 {v2.16b, v3.16b, v4.16b, v5.16b}, [%1], #64 \n" // 8 AR64
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ar64), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
#endif // LIBYUV_USE_ST2
static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
10, 9, 8, 11, 14, 13, 12, 15};
@@ -1754,6 +1878,54 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
: "r"(&kShuffleARGBToABGR) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
#else
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ar64,
int width) {
asm volatile(
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"zip1 v2.16b, v0.16b, v0.16b \n"
"zip2 v3.16b, v0.16b, v0.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"zip1 v4.16b, v1.16b, v1.16b \n"
"zip2 v5.16b, v1.16b, v1.16b \n"
"st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ar64), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
static const uvec8 kShuffleARGBToAB64[2] = {
{2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
{10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ab64,
int width) {
asm volatile(
"ldp q6, q7, [%3] \n" // 2 shufflers
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
"tbl v3.16b, {v0.16b}, v7.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"tbl v4.16b, {v1.16b}, v6.16b \n"
"tbl v5.16b, {v1.16b}, v7.16b \n"
"st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_ab64), // %1
"+r"(width) // %2
: "r"(&kShuffleARGBToAB64[0]) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
#endif // LIBYUV_USE_ST2
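Why these shuffles repeat each source byte (editor's note): placing a byte v in both halves of a 16-bit lane computes v * 0x0101 = v * 257, the exact expansion from 8-bit to full-range 16-bit (0 maps to 0, 255 to 65535).

#include <stdint.h>

static inline uint16_t expand_8_to_16(uint8_t v) {
  return (uint16_t)((v << 8) | v);  // == v * 257
}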
static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
17, 19, 21, 23, 25, 27, 29, 31};
@@ -3720,10 +3892,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
: "cc", "memory", "v0", "v1", "v2");
}
#else
static const uvec8 kYUV24Shuffle[3] =
{{ 16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20 },
{ 21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27 },
{ 10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15 }};
static const uvec8 kYUV24Shuffle[3] = {
{16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
{21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
{10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
@@ -3733,27 +3905,29 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
uint8_t* dst_yuv24,
int width) {
asm volatile(
"ld1 {v5.16b,v6.16b,v7.16b}, [%4]\n" // 3 shuffler constants
"ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
"ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
"tbl v2.16b, {v0.16b,v1.16b}, v5.16b\n" // weave into YUV24
"ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
"ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
"tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
"prfm pldl1keep, [%0, 448] \n"
"tbl v3.16b, {v0.16b,v1.16b}, v6.16b\n"
"tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"tbl v4.16b, {v0.16b,v1.16b}, v7.16b\n"
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st1 {v2.16b,v3.16b,v4.16b}, [%2], #48\n" // store 16 YUV pixels
"tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
: "r"(&kYUV24Shuffle[0]) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#endif // LIBYUV_USE_ST3
// Note ST2 8b version is faster than zip+ST1
// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
@@ -3915,32 +4089,6 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
: "cc", "memory", "v0", "v1", "v2");
}
void MergeUVRow_16_NEON(const uint16_t* src_u,
const uint16_t* src_v,
uint16_t* dst_uv,
int depth,
int width) {
int shift = 16 - depth;
asm volatile(
"dup v2.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"ushl v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"ushl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%1, 448] \n"
"st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "v0", "v1", "v2");
}
void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,