mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
J400ToARGB optimized for Exynos using ZIP+ST1
Bug: 204562143
Change-Id: I56c98198c02bd0dd1283f1c14837730c92832c39
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3328702
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
c5d48a11f9
commit
d7a2d5da87
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1805
|
Version: 1806
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -320,7 +320,6 @@ extern "C" {
|
|||||||
#define HAS_SPLITRGBROW_SSSE3
|
#define HAS_SPLITRGBROW_SSSE3
|
||||||
#define HAS_SWAPUVROW_SSSE3
|
#define HAS_SWAPUVROW_SSSE3
|
||||||
|
|
||||||
|
|
||||||
#if defined(__x86_64__) || !defined(__pic__)
|
#if defined(__x86_64__) || !defined(__pic__)
|
||||||
// TODO(fbarchard): fix build error on android_full_debug=1
|
// TODO(fbarchard): fix build error on android_full_debug=1
|
||||||
// https://code.google.com/p/libyuv/issues/detail?id=517
|
// https://code.google.com/p/libyuv/issues/detail?id=517
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1805
|
#define LIBYUV_VERSION 1806
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
@ -545,21 +545,21 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
|
|||||||
#undef ANY31PT
|
#undef ANY31PT
|
||||||
|
|
||||||
// Any 2 planes to 1.
|
// Any 2 planes to 1.
|
||||||
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
|
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
|
||||||
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
|
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
|
||||||
int width) { \
|
int width) { \
|
||||||
SIMD_ALIGNED(uint8_t temp[128 * 3]); \
|
SIMD_ALIGNED(uint8_t temp[128 * 3]); \
|
||||||
memset(temp, 0, 128 * 2); /* for msan */ \
|
memset(temp, 0, 128 * 2); /* for msan */ \
|
||||||
int r = width & MASK; \
|
int r = width & MASK; \
|
||||||
int n = width & ~MASK; \
|
int n = width & ~MASK; \
|
||||||
if (n > 0) { \
|
if (n > 0) { \
|
||||||
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
|
ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
|
||||||
} \
|
} \
|
||||||
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
|
memcpy(temp, y_buf + n * SBPP, r * SBPP); \
|
||||||
memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
|
memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
|
||||||
SS(r, UVSHIFT) * SBPP2); \
|
SS(r, UVSHIFT) * SBPP2); \
|
||||||
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
|
ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
|
||||||
memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
|
memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
|
||||||
}
|
}
|
||||||
|
|
||||||
// Merge functions.
|
// Merge functions.
|
||||||
|
|||||||
@ -8894,10 +8894,10 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
|
||||||
|
|
||||||
static const uvec8 kYUV24Shuffle[3] =
|
static const uvec8 kYUV24Shuffle[3] = {
|
||||||
{{ 8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12 },
|
{8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
|
||||||
{ 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15 },
|
{9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
|
||||||
{ 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7 }};
|
{2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};
|
||||||
|
|
||||||
// Convert biplanar NV21 to packed YUV24
|
// Convert biplanar NV21 to packed YUV24
|
||||||
// NV21 has VU in memory for chroma.
|
// NV21 has VU in memory for chroma.
|
||||||
@ -8929,10 +8929,10 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
|
|||||||
"lea 48(%2),%2 \n"
|
"lea 48(%2),%2 \n"
|
||||||
"sub $16,%3 \n" // 16 pixels per loop
|
"sub $16,%3 \n" // 16 pixels per loop
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
: "+r"(src_y), // %0
|
: "+r"(src_y), // %0
|
||||||
"+r"(src_vu), // %1
|
"+r"(src_vu), // %1
|
||||||
"+r"(dst_yuv24), // %2
|
"+r"(dst_yuv24), // %2
|
||||||
"+r"(width) // %3
|
"+r"(width) // %3
|
||||||
: "r"(&kYUV24Shuffle[0]) // %4
|
: "r"(&kYUV24Shuffle[0]) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||||
}
|
}
|
||||||
@ -8945,79 +8945,78 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
|
|||||||
uint8_t* dst_yuv24,
|
uint8_t* dst_yuv24,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"sub %0,%1 \n"
|
"sub %0,%1 \n"
|
||||||
"vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
|
"vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants
|
||||||
"vbroadcastf128 16(%4),%%ymm5 \n"
|
"vbroadcastf128 16(%4),%%ymm5 \n"
|
||||||
"vbroadcastf128 32(%4),%%ymm6 \n"
|
"vbroadcastf128 32(%4),%%ymm6 \n"
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
|
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
|
||||||
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
|
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
|
||||||
"lea 32(%0),%0 \n"
|
"lea 32(%0),%0 \n"
|
||||||
"vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
|
"vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
|
||||||
"vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
|
"vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
|
||||||
"vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
|
"vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
|
||||||
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
|
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
|
||||||
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
|
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
|
||||||
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
|
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
|
||||||
"vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
|
"vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
|
||||||
"vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
|
"vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
|
||||||
"vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
|
"vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
|
||||||
"vmovdqu %%ymm3,(%2) \n"
|
"vmovdqu %%ymm3,(%2) \n"
|
||||||
"vmovdqu %%ymm0,32(%2) \n"
|
"vmovdqu %%ymm0,32(%2) \n"
|
||||||
"vmovdqu %%ymm1,64(%2) \n"
|
"vmovdqu %%ymm1,64(%2) \n"
|
||||||
"lea 96(%2),%2 \n"
|
"lea 96(%2),%2 \n"
|
||||||
"sub $32,%3 \n" // 32 pixels per loop
|
"sub $32,%3 \n" // 32 pixels per loop
|
||||||
"jg 1b \n"
|
"jg 1b \n"
|
||||||
"vzeroupper \n"
|
"vzeroupper \n"
|
||||||
: "+r"(src_y), // %0
|
: "+r"(src_y), // %0
|
||||||
"+r"(src_vu), // %1
|
"+r"(src_vu), // %1
|
||||||
"+r"(dst_yuv24), // %2
|
"+r"(dst_yuv24), // %2
|
||||||
"+r"(width) // %3
|
"+r"(width) // %3
|
||||||
: "r"(&kYUV24Shuffle[0]) // %4
|
: "r"(&kYUV24Shuffle[0]) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAS_NV21ToYUV24ROW_AVX512
|
#ifdef HAS_NV21ToYUV24ROW_AVX512
|
||||||
// The following VBMI VEX256 code tests okay with the intelsde emulator.
|
// The following VBMI VEX256 code tests okay with the intelsde emulator.
|
||||||
static const lvec8 kYUV24Perm[3] =
|
static const lvec8 kYUV24Perm[3] = {
|
||||||
{{ 32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
|
{32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
|
||||||
37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43 },
|
37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43},
|
||||||
{ 10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
|
{10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
|
||||||
48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52 },
|
48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
|
||||||
{ 53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
|
{53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
|
||||||
26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31 }};
|
26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};
|
||||||
|
|
||||||
void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
|
void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
|
||||||
const uint8_t* src_vu,
|
const uint8_t* src_vu,
|
||||||
uint8_t* dst_yuv24,
|
uint8_t* dst_yuv24,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"sub %0,%1 \n"
|
"sub %0,%1 \n"
|
||||||
"vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
|
"vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants
|
||||||
"vmovdqa 32(%4),%%ymm5 \n"
|
"vmovdqa 32(%4),%%ymm5 \n"
|
||||||
"vmovdqa 64(%4),%%ymm6 \n"
|
"vmovdqa 64(%4),%%ymm6 \n" LABELALIGN
|
||||||
LABELALIGN
|
"1: \n"
|
||||||
"1: \n"
|
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
|
||||||
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
|
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
|
||||||
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
|
"lea 32(%0),%0 \n"
|
||||||
"lea 32(%0),%0 \n"
|
"vmovdqa %%ymm2, %%ymm0 \n"
|
||||||
"vmovdqa %%ymm2, %%ymm0 \n"
|
"vmovdqa %%ymm2, %%ymm1 \n"
|
||||||
"vmovdqa %%ymm2, %%ymm1 \n"
|
"vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
|
||||||
"vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
|
"vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
|
||||||
"vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
|
"vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
|
||||||
"vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
|
"vmovdqu %%ymm0,(%2) \n"
|
||||||
"vmovdqu %%ymm0,(%2) \n"
|
"vmovdqu %%ymm1,32(%2) \n"
|
||||||
"vmovdqu %%ymm1,32(%2) \n"
|
"vmovdqu %%ymm2,64(%2) \n"
|
||||||
"vmovdqu %%ymm2,64(%2) \n"
|
"lea 96(%2),%2 \n"
|
||||||
"lea 96(%2),%2 \n"
|
"sub $32,%3 \n"
|
||||||
"sub $32,%3 \n"
|
"jg 1b \n"
|
||||||
"jg 1b \n"
|
"vzeroupper \n"
|
||||||
"vzeroupper \n"
|
: "+r"(src_y), // %0
|
||||||
: "+r"(src_y), // %0
|
"+r"(src_vu), // %1
|
||||||
"+r"(src_vu), // %1
|
"+r"(dst_yuv24), // %2
|
||||||
"+r"(dst_yuv24), // %2
|
"+r"(width) // %3
|
||||||
"+r"(width) // %3
|
|
||||||
: "r"(&kYUV24Perm[0]) // %4
|
: "r"(&kYUV24Perm[0]) // %4
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,7 +15,8 @@ namespace libyuv {
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Enable LIBYUV_USE_ST2 and LIBYUV_USE_ST3 for CPUs that prefer them.
|
// Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
|
||||||
|
// STn over ZIP1+ST1
|
||||||
// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
|
// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
|
||||||
|
|
||||||
// This module is for GCC Neon armv8 64 bit.
|
// This module is for GCC Neon armv8 64 bit.
|
||||||
@ -385,6 +386,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
|
|||||||
: "cc", "memory", YUVTORGB_REGS, "v19");
|
: "cc", "memory", YUVTORGB_REGS, "v19");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if LIBYUV_USE_ST4
|
||||||
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"movi v23.8b, #255 \n"
|
"movi v23.8b, #255 \n"
|
||||||
@ -402,6 +404,27 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
|||||||
:
|
:
|
||||||
: "cc", "memory", "v20", "v21", "v22", "v23");
|
: "cc", "memory", "v20", "v21", "v22", "v23");
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
// ZIP1/ZIP2 + STP variant of J400ToARGBRow_NEON, used when LIBYUV_USE_ST4 is
// not set.
// Expands 8 gray bytes per loop iteration into 8 four-byte pixels whose
// bytes in memory are Y, Y, Y, 255.
//   src_y:    input gray bytes; advanced by 8 per iteration.
//   dst_argb: output pixels; advanced by 32 bytes per iteration.
//   width:    pixel count; reduced by 8 per iteration and tested after the
//             loop body, so one full group of 8 is always processed.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||||
|
asm volatile(
|
||||||
|
"movi v20.8b, #255 \n" // alpha constant: 255 in the low 8 bytes
|
||||||
|
"1: \n"
|
||||||
|
"ldr d16, [%0], #8 \n" // load 8 Y bytes, advance src_y
|
||||||
|
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||||
|
"zip1 v18.16b, v16.16b, v16.16b \n" // YY
|
||||||
|
"zip1 v19.16b, v16.16b, v20.16b \n" // YA
|
||||||
|
"prfm pldl1keep, [%0, 448] \n" // prefetch upcoming source bytes
|
||||||
|
"zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
|
||||||
|
"zip2 v17.16b, v18.16b, v19.16b \n" // YYYA for the upper 4 pixels
|
||||||
|
"stp q16, q17, [%1], #32 \n" // store 8 pixels (32 bytes), advance dst
|
||||||
|
"b.gt 1b \n" // loop while the remaining count is > 0
|
||||||
|
: "+r"(src_y), // %0
|
||||||
|
"+r"(dst_argb), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
:
|
||||||
|
: "cc", "memory", "v16", "v17", "v18", "v19", "v20");
|
||||||
|
}
|
||||||
|
#endif // LIBYUV_USE_ST4
|
||||||
|
|
||||||
void NV12ToARGBRow_NEON(const uint8_t* src_y,
|
void NV12ToARGBRow_NEON(const uint8_t* src_y,
|
||||||
const uint8_t* src_uv,
|
const uint8_t* src_uv,
|
||||||
@ -581,6 +604,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if LIBYUV_USE_ST2
|
||||||
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
||||||
void MergeUVRow_NEON(const uint8_t* src_u,
|
void MergeUVRow_NEON(const uint8_t* src_u,
|
||||||
const uint8_t* src_v,
|
const uint8_t* src_v,
|
||||||
@ -604,6 +628,86 @@ void MergeUVRow_NEON(const uint8_t* src_u,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ST2 variant of MergeUVRow_16_NEON, used when LIBYUV_USE_ST2 is set.
// Per loop iteration: loads 8 U and 8 V 16-bit samples, shifts every sample
// by (16 - depth) bits with USHL (a left shift when depth < 16) and stores
// them interleaved as 8 U,V pairs using ST2.
//   src_u, src_v: planar inputs; each advanced by 16 bytes per iteration.
//   dst_uv:       interleaved output; advanced by 32 bytes per iteration.
//   depth:        used only to derive the shift amount, 16 - depth.
//   width:        sample count; reduced by 8 per iteration and tested after
//                 the loop body, so one full group of 8 is always processed.
void MergeUVRow_16_NEON(const uint16_t* src_u,
|
||||||
|
const uint16_t* src_v,
|
||||||
|
uint16_t* dst_uv,
|
||||||
|
int depth,
|
||||||
|
int width) {
|
||||||
|
int shift = 16 - depth;
|
||||||
|
asm volatile(
|
||||||
|
"dup v2.8h, %w4 \n" // broadcast shift to all 8 lanes
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
|
||||||
|
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
|
||||||
|
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
|
||||||
|
"ushl v0.8h, v0.8h, v2.8h \n" // shift U
|
||||||
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
"ushl v1.8h, v1.8h, v2.8h \n" // shift V
|
||||||
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
|
"st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
|
||||||
|
"b.gt 1b \n"
|
||||||
|
: "+r"(src_u), // %0
|
||||||
|
"+r"(src_v), // %1
|
||||||
|
"+r"(dst_uv), // %2
|
||||||
|
"+r"(width) // %3
|
||||||
|
: "r"(shift) // %4
|
||||||
|
: "cc", "memory", "v0", "v1", "v2");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// ZIP1/ZIP2 + ST1 variant of MergeUVRow_NEON, used when LIBYUV_USE_ST2 is not
// set.
// Reads 16 U's and V's and writes out 16 pairs of UV per loop iteration.
//   src_u, src_v: planar inputs; each advanced by 16 bytes per iteration.
//   dst_uv:       interleaved output; advanced by 32 bytes per iteration.
//   width:        sample count; reduced by 16 per iteration and tested after
//                 the loop body, so one full group of 16 is always processed.
|
||||||
|
void MergeUVRow_NEON(const uint8_t* src_u,
|
||||||
|
const uint8_t* src_v,
|
||||||
|
uint8_t* dst_uv,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v0.16b}, [%0], #16 \n" // load U
|
||||||
|
"ld1 {v1.16b}, [%1], #16 \n" // load V
|
||||||
|
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||||
|
"zip1 v2.16b, v0.16b, v1.16b \n" // UV pairs 0..7
|
||||||
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
"zip2 v3.16b, v0.16b, v1.16b \n" // UV pairs 8..15
|
||||||
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
|
"st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
|
||||||
|
"b.gt 1b \n"
|
||||||
|
: "+r"(src_u), // %0
|
||||||
|
"+r"(src_v), // %1
|
||||||
|
"+r"(dst_uv), // %2
|
||||||
|
"+r"(width) // %3 // Output registers
|
||||||
|
: // Input registers
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
void MergeUVRow_16_NEON(const uint16_t* src_u,
|
||||||
|
const uint16_t* src_v,
|
||||||
|
uint16_t* dst_uv,
|
||||||
|
int depth,
|
||||||
|
int width) {
|
||||||
|
int shift = 16 - depth;
|
||||||
|
asm volatile(
|
||||||
|
"dup v4.8h, %w4 \n"
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
|
||||||
|
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
|
||||||
|
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
|
||||||
|
"ushl v0.8h, v0.8h, v4.8h \n"
|
||||||
|
"ushl v1.8h, v1.8h, v4.8h \n"
|
||||||
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
"zip1 v2.8h, v0.8h, v1.8h \n"
|
||||||
|
"zip2 v3.8h, v0.8h, v1.8h \n"
|
||||||
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
|
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
|
||||||
|
"b.gt 1b \n"
|
||||||
|
: "+r"(src_u), // %0
|
||||||
|
"+r"(src_v), // %1
|
||||||
|
"+r"(dst_uv), // %2
|
||||||
|
"+r"(width) // %3
|
||||||
|
: "r"(shift) // %4
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
|
||||||
|
}
|
||||||
|
#endif // LIBYUV_USE_ST2
|
||||||
|
|
||||||
// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
|
// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
|
||||||
void SplitRGBRow_NEON(const uint8_t* src_rgb,
|
void SplitRGBRow_NEON(const uint8_t* src_rgb,
|
||||||
uint8_t* dst_r,
|
uint8_t* dst_r,
|
||||||
@ -684,6 +788,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if LIBYUV_USE_ST4
|
||||||
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
|
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
|
||||||
void MergeARGBRow_NEON(const uint8_t* src_r,
|
void MergeARGBRow_NEON(const uint8_t* src_r,
|
||||||
const uint8_t* src_g,
|
const uint8_t* src_g,
|
||||||
@ -693,9 +798,9 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
|
|||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v2.16b}, [%0], #16 \n" // load R
|
|
||||||
"ld1 {v1.16b}, [%1], #16 \n" // load G
|
|
||||||
"ld1 {v0.16b}, [%2], #16 \n" // load B
|
"ld1 {v0.16b}, [%2], #16 \n" // load B
|
||||||
|
"ld1 {v1.16b}, [%1], #16 \n" // load G
|
||||||
|
"ld1 {v2.16b}, [%0], #16 \n" // load R
|
||||||
"ld1 {v3.16b}, [%3], #16 \n" // load A
|
"ld1 {v3.16b}, [%3], #16 \n" // load A
|
||||||
"subs %w5, %w5, #16 \n" // 16 processed per loop
|
"subs %w5, %w5, #16 \n" // 16 processed per loop
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
@ -714,6 +819,47 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
|
|||||||
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
// ZIP1/ZIP2 + ST1 variant of MergeARGBRow_NEON, used when LIBYUV_USE_ST4 is
// not set.
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
// Each output pixel is stored as the four bytes B, G, R, A.
//   src_r, src_g, src_b, src_a: planar inputs; each advanced by 16 bytes per
//                               iteration.
//   dst_argb: packed output; advanced by 64 bytes per iteration.
//   width:    pixel count; reduced by 16 per iteration and tested after the
//             loop body, so one full group of 16 is always processed.
|
||||||
|
void MergeARGBRow_NEON(const uint8_t* src_r,
|
||||||
|
const uint8_t* src_g,
|
||||||
|
const uint8_t* src_b,
|
||||||
|
const uint8_t* src_a,
|
||||||
|
uint8_t* dst_argb,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v0.16b}, [%2], #16 \n" // load B
|
||||||
|
"ld1 {v1.16b}, [%1], #16 \n" // load G
|
||||||
|
"ld1 {v2.16b}, [%0], #16 \n" // load R
|
||||||
|
"ld1 {v3.16b}, [%3], #16 \n" // load A
|
||||||
|
"subs %w5, %w5, #16 \n" // 16 processed per loop
|
||||||
|
"prfm pldl1keep, [%2, 448] \n"
|
||||||
|
"zip1 v4.16b, v0.16b, v1.16b \n" // BG
|
||||||
|
"zip1 v5.16b, v2.16b, v3.16b \n" // RA
|
||||||
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
|
"zip2 v6.16b, v0.16b, v1.16b \n" // BG
|
||||||
|
"zip2 v7.16b, v2.16b, v3.16b \n" // RA
|
||||||
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
"zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
|
||||||
|
"zip2 v1.8h, v4.8h, v5.8h \n" // BGRA, pixels 4..7
|
||||||
|
"prfm pldl1keep, [%3, 448] \n"
|
||||||
|
"zip1 v2.8h, v6.8h, v7.8h \n" // BGRA, pixels 8..11
|
||||||
|
"zip2 v3.8h, v6.8h, v7.8h \n" // BGRA, pixels 12..15
|
||||||
|
"st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
|
||||||
|
"b.gt 1b \n"
|
||||||
|
: "+r"(src_r), // %0
|
||||||
|
"+r"(src_g), // %1
|
||||||
|
"+r"(src_b), // %2
|
||||||
|
"+r"(src_a), // %3
|
||||||
|
"+r"(dst_argb), // %4
|
||||||
|
"+r"(width) // %5
|
||||||
|
: // Input registers
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
|
||||||
|
"v7" // Clobber List
|
||||||
|
);
|
||||||
|
}
|
||||||
|
#endif // LIBYUV_USE_ST4
|
||||||
|
|
||||||
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
|
// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
|
||||||
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
|
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
|
||||||
@ -1706,28 +1852,6 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
|
|||||||
:
|
:
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3");
|
: "cc", "memory", "v0", "v1", "v2", "v3");
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
|
|
||||||
uint16_t* dst_ar64,
|
|
||||||
int width) {
|
|
||||||
asm volatile(
|
|
||||||
"1: \n"
|
|
||||||
"ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
|
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"zip1 v2.16b, v0.16b, v0.16b \n"
|
|
||||||
"zip2 v3.16b, v0.16b, v0.16b \n"
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"zip1 v4.16b, v1.16b, v1.16b \n"
|
|
||||||
"zip2 v5.16b, v1.16b, v1.16b \n"
|
|
||||||
"st1 {v2.16b, v3.16b, v4.16b, v5.16b}, [%1], #64 \n" // 8 AR64
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_argb), // %0
|
|
||||||
"+r"(dst_ar64), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
|
|
||||||
}
|
|
||||||
#endif // LIBYUV_USE_ST2
|
|
||||||
|
|
||||||
static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
|
static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
|
||||||
10, 9, 8, 11, 14, 13, 12, 15};
|
10, 9, 8, 11, 14, 13, 12, 15};
|
||||||
@ -1754,6 +1878,54 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
|||||||
: "r"(&kShuffleARGBToABGR) // %3
|
: "r"(&kShuffleARGBToABGR) // %3
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
// Widens 8 packed 4-byte pixels per loop iteration to 16 bits per channel.
// Each source byte is zipped with itself, so a channel value v is written as
// the two bytes v, v (i.e. v * 257 as a 16-bit value); channel order is
// unchanged.
//   src_argb: input pixels; advanced by 32 bytes per iteration.
//   dst_ar64: output; advanced by 64 bytes per iteration.
//   width:    pixel count; reduced by 8 per iteration and tested after the
//             loop body, so one full group of 8 is always processed.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
|
||||||
|
uint16_t* dst_ar64,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
|
||||||
|
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||||
|
"zip1 v2.16b, v0.16b, v0.16b \n" // pixels 0..1, bytes doubled
|
||||||
|
"zip2 v3.16b, v0.16b, v0.16b \n" // pixels 2..3
|
||||||
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
"zip1 v4.16b, v1.16b, v1.16b \n" // pixels 4..5
|
||||||
|
"zip2 v5.16b, v1.16b, v1.16b \n" // pixels 6..7
|
||||||
|
"st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
|
||||||
|
"b.gt 1b \n"
|
||||||
|
: "+r"(src_argb), // %0
|
||||||
|
"+r"(dst_ar64), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
:
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
|
||||||
|
}
|
||||||
|
|
||||||
|
static const uvec8 kShuffleARGBToAB64[2] = {
|
||||||
|
{2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
|
||||||
|
{10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
|
||||||
|
|
||||||
|
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
||||||
|
uint16_t* dst_ab64,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"ldp q6, q7, [%3] \n" // 2 shufflers
|
||||||
|
"1: \n"
|
||||||
|
"ldp q0, q1, [%0], #32 \n" // load 8 pixels
|
||||||
|
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||||
|
"tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
|
||||||
|
"tbl v3.16b, {v0.16b}, v7.16b \n"
|
||||||
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
"tbl v4.16b, {v1.16b}, v6.16b \n"
|
||||||
|
"tbl v5.16b, {v1.16b}, v7.16b \n"
|
||||||
|
"st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
|
||||||
|
"b.gt 1b \n"
|
||||||
|
: "+r"(src_argb), // %0
|
||||||
|
"+r"(dst_ab64), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "r"(&kShuffleARGBToAB64[0]) // %3
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||||
|
}
|
||||||
|
#endif // LIBYUV_USE_ST2
|
||||||
|
|
||||||
static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
|
static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
|
||||||
17, 19, 21, 23, 25, 27, 29, 31};
|
17, 19, 21, 23, 25, 27, 29, 31};
|
||||||
@ -3720,10 +3892,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
|||||||
: "cc", "memory", "v0", "v1", "v2");
|
: "cc", "memory", "v0", "v1", "v2");
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
static const uvec8 kYUV24Shuffle[3] =
|
static const uvec8 kYUV24Shuffle[3] = {
|
||||||
{{ 16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20 },
|
{16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
|
||||||
{ 21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27 },
|
{21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
|
||||||
{ 10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15 }};
|
{10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
|
||||||
|
|
||||||
// Convert biplanar NV21 to packed YUV24
|
// Convert biplanar NV21 to packed YUV24
|
||||||
// NV21 has VU in memory for chroma.
|
// NV21 has VU in memory for chroma.
|
||||||
@ -3733,27 +3905,29 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
|||||||
uint8_t* dst_yuv24,
|
uint8_t* dst_yuv24,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"ld1 {v5.16b,v6.16b,v7.16b}, [%4]\n" // 3 shuffler constants
|
"ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
|
"ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
|
||||||
"ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
|
"ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
|
||||||
"tbl v2.16b, {v0.16b,v1.16b}, v5.16b\n" // weave into YUV24
|
"tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"tbl v3.16b, {v0.16b,v1.16b}, v6.16b\n"
|
"tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
"prfm pldl1keep, [%1, 448] \n"
|
||||||
"tbl v4.16b, {v0.16b,v1.16b}, v7.16b\n"
|
"tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
|
||||||
"subs %w3, %w3, #16 \n" // 16 pixels per loop
|
"subs %w3, %w3, #16 \n" // 16 pixels per loop
|
||||||
"st1 {v2.16b,v3.16b,v4.16b}, [%2], #48\n" // store 16 YUV pixels
|
"st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_y), // %0
|
: "+r"(src_y), // %0
|
||||||
"+r"(src_vu), // %1
|
"+r"(src_vu), // %1
|
||||||
"+r"(dst_yuv24), // %2
|
"+r"(dst_yuv24), // %2
|
||||||
"+r"(width) // %3
|
"+r"(width) // %3
|
||||||
: "r"(&kYUV24Shuffle[0]) // %4
|
: "r"(&kYUV24Shuffle[0]) // %4
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
||||||
}
|
}
|
||||||
#endif // LIBYUV_USE_ST3
|
#endif // LIBYUV_USE_ST3
|
||||||
|
|
||||||
|
// Note ST2 8b version is faster than zip+ST1
|
||||||
|
|
||||||
// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
|
// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
|
||||||
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||||
int src_stride_ayuv,
|
int src_stride_ayuv,
|
||||||
@ -3915,32 +4089,6 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
|
|||||||
: "cc", "memory", "v0", "v1", "v2");
|
: "cc", "memory", "v0", "v1", "v2");
|
||||||
}
|
}
|
||||||
|
|
||||||
void MergeUVRow_16_NEON(const uint16_t* src_u,
|
|
||||||
const uint16_t* src_v,
|
|
||||||
uint16_t* dst_uv,
|
|
||||||
int depth,
|
|
||||||
int width) {
|
|
||||||
int shift = 16 - depth;
|
|
||||||
asm volatile(
|
|
||||||
"dup v2.8h, %w4 \n"
|
|
||||||
"1: \n"
|
|
||||||
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
|
|
||||||
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
|
|
||||||
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
|
|
||||||
"ushl v0.8h, v0.8h, v2.8h \n"
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"ushl v1.8h, v1.8h, v2.8h \n"
|
|
||||||
"prfm pldl1keep, [%1, 448] \n"
|
|
||||||
"st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_u), // %0
|
|
||||||
"+r"(src_v), // %1
|
|
||||||
"+r"(dst_uv), // %2
|
|
||||||
"+r"(width) // %3
|
|
||||||
: "r"(shift) // %4
|
|
||||||
: "cc", "memory", "v0", "v1", "v2");
|
|
||||||
}
|
|
||||||
|
|
||||||
void MultiplyRow_16_NEON(const uint16_t* src_y,
|
void MultiplyRow_16_NEON(const uint16_t* src_y,
|
||||||
uint16_t* dst_y,
|
uint16_t* dst_y,
|
||||||
int scale,
|
int scale,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user