Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-06 08:46:47 +08:00)
NV21ToYUV24: replace ST3 with ST1. ARGBToAR64: replace ST2 with ST1.

On Samsung S8 (Exynos M2):
  Was ST3 NV21ToYUV24_Opt (769 ms), now ST1 NV21ToYUV24_Opt (473 ms)
  Was ST2 ARGBToAR64_Opt (1759 ms), now ST1 ARGBToAR64_Opt (987 ms)
Skylake Xeon, AVX2 version:
  Was NV21ToYUV24_Opt (885 ms), now NV21ToYUV24_Opt (194 ms)

Bug: b/204562143, b/124413599
Change-Id: Icc9cb64d822cd11937789a4e04fbb773b3e33aa3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3290664
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in: parent a04e4f87fb, commit 000806f373.
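For orientation, this is the conversion the NV21ToYUV24 kernels below
accelerate. A minimal scalar model (the helper name is illustrative;
libyuv's actual C reference is NV21ToYUV24Row_C):

#include <stdint.h>

// NV21: full-resolution Y plane plus interleaved VU chroma at half
// horizontal resolution. YUV24: packed 3 bytes per pixel, V, U, Y order.
static void NV21ToYUV24Row_Scalar(const uint8_t* src_y,
                                  const uint8_t* src_vu,
                                  uint8_t* dst_yuv24,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    dst_yuv24[3 * x + 0] = src_vu[(x & ~1) + 0];  // V, shared by 2 pixels
    dst_yuv24[3 * x + 1] = src_vu[(x & ~1) + 1];  // U, shared by 2 pixels
    dst_yuv24[3 * x + 2] = src_y[x];              // Y, per pixel
  }
}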
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1803
Version: 1805
License: BSD
License File: LICENSE

@@ -303,6 +303,7 @@ extern "C" {
#define HAS_MERGEXRGBROW_SSE2
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_SSSE3
#define HAS_NV21TOYUV24ROW_SSSE3
#define HAS_P210TOAR30ROW_SSSE3
#define HAS_P210TOARGBROW_SSSE3
#define HAS_P410TOAR30ROW_SSSE3
@@ -319,6 +320,7 @@ extern "C" {
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3

#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1
// https://code.google.com/p/libyuv/issues/detail?id=517
@@ -355,6 +357,7 @@ extern "C" {
#define HAS_MERGEXR64ROW_AVX2
#define HAS_MERGEXRGB16TO8ROW_AVX2
#define HAS_MERGEXRGBROW_AVX2
#define HAS_NV21TOYUV24ROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I212TOAR30ROW_AVX2
@@ -379,8 +382,6 @@ extern "C" {
#define HAS_SPLITXRGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24
// #define HAS_NV21TOYUV24ROW_AVX2

#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1
@@ -3227,6 +3228,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width);
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width);
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
@@ -3568,6 +3573,10 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
                             uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
void NV21ToYUV24Row_Any_SSSE3(const uint8_t* src_y,
                              const uint8_t* src_vu,
                              uint8_t* dst_yuv24,
                              int width);
void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
                             const uint8_t* src_vu,
                             uint8_t* dst_yuv24,

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 1803
#define LIBYUV_VERSION 1805

#endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -3925,6 +3925,14 @@ int NV21ToYUV24(const uint8_t* src_y,
    }
  }
#endif
#if defined(HAS_NV21TOYUV24ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      NV21ToYUV24Row = NV21ToYUV24Row_SSSE3;
    }
  }
#endif
#if defined(HAS_NV21TOYUV24ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;

@@ -545,21 +545,21 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
#undef ANY31PT

// Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
               int width) { \
    SIMD_ALIGNED(uint8_t temp[64 * 3]); \
    memset(temp, 0, 64 * 2); /* for msan */ \
    int r = width & MASK; \
    int n = width & ~MASK; \
    if (n > 0) { \
      ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
    } \
    memcpy(temp, y_buf + n * SBPP, r * SBPP); \
    memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \
           SS(r, UVSHIFT) * SBPP2); \
    ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \
    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
               int width) { \
    SIMD_ALIGNED(uint8_t temp[128 * 3]); \
    memset(temp, 0, 128 * 2); /* for msan */ \
    int r = width & MASK; \
    int n = width & ~MASK; \
    if (n > 0) { \
      ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \
    } \
    memcpy(temp, y_buf + n * SBPP, r * SBPP); \
    memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \
           SS(r, UVSHIFT) * SBPP2); \
    ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
    memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \
  }

// Merge functions.
@@ -581,6 +581,9 @@ ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
#ifdef HAS_NV21TOYUV24ROW_NEON
ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
#endif
#ifdef HAS_NV21TOYUV24ROW_SSSE3
ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
#ifdef HAS_NV21TOYUV24ROW_AVX2
ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
#endif
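A quick worked example of the remainder math in ANY21 (the width value is
chosen only for illustration): with the AVX2 instantiation above, MASK is 31.

#include <stdio.h>

int main(void) {
  int width = 50;
  int mask = 31;          // NV21ToYUV24Row_Any_AVX2 uses MASK = 31
  int n = width & ~mask;  // 32: pixels the SIMD loop handles directly
  int r = width & mask;   // 18: leftover pixels routed through temp
  printf("n=%d r=%d\n", n, r);
  return 0;
}

Doubling temp from 64 * 3 to 128 * 3 bytes is what lets the AVX2 variant fit
this macro: its remainder pass writes MASK + 1 = 32 pixels * 3 bytes = 96
bytes into the output slice of temp, which no longer fits in a 64-byte slice.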

@@ -8894,127 +8894,135 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#ifdef HAS_NV21TOYUV24ROW_AVX2
static const uvec8 kYUV24Shuffle[3] =
    {{ 8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12 },
     { 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15 },
     { 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7 }};

// begin NV21ToYUV24Row_C avx2 constants
static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width) {
  asm volatile(
      "sub        %0,%1                          \n"
      "movdqa     (%4),%%xmm4                    \n"  // 3 shuffler constants
      "movdqa     16(%4),%%xmm5                  \n"
      "movdqa     32(%4),%%xmm6                  \n"
      "1:                                        \n"
      "movdqu     (%0),%%xmm2                    \n"  // load 16 Y values
      "movdqu     (%0,%1),%%xmm3                 \n"  // load 8 VU values
      "lea        16(%0),%0                      \n"
      "movdqa     %%xmm2,%%xmm0                  \n"
      "movdqa     %%xmm2,%%xmm1                  \n"
      "shufps     $0x44,%%xmm3,%%xmm0            \n"  // Y 0..7, UV 0..3
      "shufps     $0x99,%%xmm3,%%xmm1            \n"  // Y 4..11, UV 2..5
      "shufps     $0xee,%%xmm3,%%xmm2            \n"  // Y 8..15, UV 4..7
      "pshufb     %%xmm4, %%xmm0                 \n"  // weave into YUV24
      "pshufb     %%xmm5, %%xmm1                 \n"
      "pshufb     %%xmm6, %%xmm2                 \n"
      "movdqu     %%xmm0,(%2)                    \n"
      "movdqu     %%xmm1,16(%2)                  \n"
      "movdqu     %%xmm2,32(%2)                  \n"
      "lea        48(%2),%2                      \n"
      "sub        $16,%3                         \n"  // 16 pixels per loop
      "jg         1b                             \n"
      : "+r"(src_y),             // %0
        "+r"(src_vu),            // %1
        "+r"(dst_yuv24),         // %2
        "+r"(width)              // %3
      : "r"(&kYUV24Shuffle[0])   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}

static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};

static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};

static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};

static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};

static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};

static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};

static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};

static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};

// NV21ToYUV24Row_AVX2
// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory
void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  uint8_t* src_y_ptr;
  uint64_t src_offset = 0;
  uint64_t width64;

  width64 = width;
  src_y_ptr = (uint8_t*)src_y;

  asm volatile(
      "vmovdqu    %5, %%ymm0                     \n"  // init blend value
      "vmovdqu    %6, %%ymm1                     \n"  // init blend value
      "vmovdqu    %7, %%ymm2                     \n"  // init blend value
      // "sub     $0x20, %3                      \n"  // sub 32 from width for final loop
      "sub        %0,%1                          \n"
      "vbroadcastf128 (%4),%%ymm4                \n"  // 3 shuffler constants
      "vbroadcastf128 16(%4),%%ymm5              \n"
      "vbroadcastf128 32(%4),%%ymm6              \n"

      LABELALIGN
      "1:                                        \n"  // label 1
      "vmovdqu    (%0,%4), %%ymm3                \n"  // src_y
      "vmovdqu    1(%1,%4), %%ymm4               \n"  // src_uv+1
      "vmovdqu    (%1), %%ymm5                   \n"  // src_uv
      "vpshufb    %8, %%ymm3, %%ymm13            \n"  // y, kSHUF0 for shuf
      "vpshufb    %9, %%ymm4, %%ymm14            \n"  // uv+1, kSHUF1 for shuf
      "vpshufb    %10, %%ymm5, %%ymm15           \n"  // uv, kSHUF2 for shuf
      "vpshufb    %11, %%ymm3, %%ymm3            \n"  // y, kSHUF3 for shuf
      "vpshufb    %12, %%ymm4, %%ymm4            \n"  // uv+1, kSHUF4 for shuf
      "vpblendvb  %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n"  // blend 0
      "vpblendvb  %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n"  // blend 0
      "vpblendvb  %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n"  // blend 2
      "vpblendvb  %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n"  // blend 1
      "vpshufb    %13, %%ymm5, %%ymm15           \n"  // shuffle const
      "vpor       %%ymm4, %%ymm3, %%ymm5         \n"  // get results
      "vmovdqu    %%ymm12, 0x20(%2)              \n"  // store dst_yuv+20h
      "vpor       %%ymm15, %%ymm5, %%ymm3        \n"  // get results
      "add        $0x20, %4                      \n"  // add to src buffer ptr
      "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n"  // insert
      "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n"  // insert
      "vmovdqu    %%ymm4, (%2)                   \n"  // store dst_yuv
      "vmovdqu    %%ymm5, 0x40(%2)               \n"  // store dst_yuv+40h
      "add        $0x60,%2                       \n"  // add to dst buffer ptr
      // "cmp     %3, %4                         \n"  // (width64 - 32 bytes) and src_offset
      "sub        $0x20,%3                       \n"  // 32 pixels per loop
      "jg         1b                             \n"
      "vzeroupper                                \n"  // sse-avx2 transitions

      : "+r"(src_y),       // %0
        "+r"(src_vu),      // %1
        "+r"(dst_yuv24),   // %2
        "+r"(width64),     // %3
        "+r"(src_offset)   // %4
      : "m"(kBLEND0),      // %5
        "m"(kBLEND1),      // %6
        "m"(kBLEND2),      // %7
        "m"(kSHUF0),       // %8
        "m"(kSHUF1),       // %9
        "m"(kSHUF2),       // %10
        "m"(kSHUF3),       // %11
        "m"(kSHUF4),       // %12
        "m"(kSHUF5)        // %13
      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
        "xmm13", "xmm14", "xmm15");
"1: \n"
|
||||
"vmovdqu (%0),%%ymm2 \n" // load 32 Y values
|
||||
"vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values
|
||||
"lea 32(%0),%0 \n"
|
||||
"vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3
|
||||
"vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5
|
||||
"vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7
|
||||
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24
|
||||
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
|
||||
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
|
||||
"vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n"
|
||||
"vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n"
|
||||
"vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n"
|
||||
"vmovdqu %%ymm3,(%2) \n"
|
||||
"vmovdqu %%ymm0,32(%2) \n"
|
||||
"vmovdqu %%ymm1,64(%2) \n"
|
||||
"lea 96(%2),%2 \n"
|
||||
"sub $32,%3 \n" // 32 pixels per loop
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_vu), // %1
|
||||
"+r"(dst_yuv24), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(&kYUV24Shuffle[0]) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
}
|
||||
#endif // HAS_NV21TOYUV24ROW_AVX2
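To see how the kYUV24Shuffle masks weave VUY triplets in the functions
above, here is a scalar model of the pshufb step on the first mask, using
marker characters in place of real pixel bytes:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  // kYUV24Shuffle[0]: indices 0..7 pick Y bytes, 8..15 pick VU bytes of a
  // register holding [Y0..Y7 | V0 U0 V1 U1 V2 U2 V3 U3] after shufps $0x44.
  const uint8_t mask[16] = {8, 9, 0, 8, 9, 1, 10, 11,
                            2, 10, 11, 3, 12, 13, 4, 12};
  const char reg[16] = {'0', '1', '2', '3', '4', '5', '6', '7',   // Y0..Y7
                        'v', 'u', 'V', 'U', 'w', 'x', 'W', 'X'};  // VU pairs
  for (int i = 0; i < 16; ++i) {
    putchar(reg[mask[i]]);
  }
  putchar('\n');  // prints "vu0vu1VU2VU3wx4w": V,U,Y triplets emerging
  return 0;
}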
#ifdef HAS_NV21ToYUV24ROW_AVX512
// The following VBMI VEX256 code tests okay with the Intel SDE emulator.
static const lvec8 kYUV24Perm[3] =
    {{ 32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36,
       37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43 },
     { 10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
       48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52 },
     { 53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
       26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31 }};

void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
                           const uint8_t* src_vu,
                           uint8_t* dst_yuv24,
                           int width) {
  asm volatile(
      "sub        %0,%1                          \n"
      "vmovdqa    (%4),%%ymm4                    \n"  // 3 shuffler constants
      "vmovdqa    32(%4),%%ymm5                  \n"
      "vmovdqa    64(%4),%%ymm6                  \n"
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm2                    \n"  // load 32 Y values
      "vmovdqu    (%0,%1),%%ymm3                 \n"  // load 16 VU values
      "lea        32(%0),%0                      \n"
      "vmovdqa    %%ymm2, %%ymm0                 \n"
      "vmovdqa    %%ymm2, %%ymm1                 \n"
      "vpermt2b   %%ymm3,%%ymm4,%%ymm0           \n"
      "vpermt2b   %%ymm3,%%ymm5,%%ymm1           \n"
      "vpermt2b   %%ymm3,%%ymm6,%%ymm2           \n"
      "vmovdqu    %%ymm0,(%2)                    \n"
      "vmovdqu    %%ymm1,32(%2)                  \n"
      "vmovdqu    %%ymm2,64(%2)                  \n"
      "lea        96(%2),%2                      \n"
      "sub        $32,%3                         \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src_y),          // %0
        "+r"(src_vu),         // %1
        "+r"(dst_yuv24),      // %2
        "+r"(width)           // %3
      : "r"(&kYUV24Perm[0])   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}

#endif  // HAS_NV21ToYUV24ROW_AVX512
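For reference, vpermt2b above acts as a single 64-byte two-table lookup,
which is why the AVX512 path needs no blend or vperm2i128 fixup. A scalar
model of the index semantics (helper name illustrative):

#include <stdint.h>

// vpermt2b with 256-bit operands: index bits [4:0] pick the byte within a
// table, bit 5 picks the table (here 0 = Y register, 1 = VU register).
// kYUV24Perm entries of 32..63 therefore select VU bytes directly.
static uint8_t permt2b_byte(const uint8_t y[32], const uint8_t vu[32],
                            uint8_t index) {
  return (index & 32) ? vu[index & 31] : y[index & 31];
}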

#ifdef HAS_SWAPUVROW_SSSE3

@@ -15,6 +15,9 @@ namespace libyuv {
extern "C" {
#endif

// Enable LIBYUV_USE_ST2 and LIBYUV_USE_ST3 for CPUs that prefer them.
// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

@@ -1683,6 +1686,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
}

#if LIBYUV_USE_ST2
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
@@ -1702,6 +1706,28 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}
#else
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(
      "1:                                        \n"
      "ldp        q0, q1, [%0], #32              \n"  // load 8 ARGB pixels
      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
      "zip1       v2.16b, v0.16b, v0.16b         \n"
      "zip2       v3.16b, v0.16b, v0.16b         \n"
      "prfm       pldl1keep, [%0, 448]           \n"
      "zip1       v4.16b, v1.16b, v1.16b         \n"
      "zip2       v5.16b, v1.16b, v1.16b         \n"
      "st1        {v2.16b, v3.16b, v4.16b, v5.16b}, [%1], #64 \n"  // 8 AR64
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
#endif  // LIBYUV_USE_ST2
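The zip trick in the ST1 variant above widens each 8-bit channel to 16 bits
by interleaving every byte with a copy of itself, i.e. v -> v * 0x0101. A
scalar sketch (the function name is illustrative; libyuv's C reference is
ARGBToAR64Row_C):

#include <stdint.h>

// Widen 8-bit ARGB to 16-bit AR64 by byte replication: interleaving a byte
// b with itself yields the little-endian 16-bit value b | (b << 8), which
// is what "zip1/zip2 vN.16b, v0.16b, v0.16b" produces per byte pair.
static void ARGBToAR64Row_Scalar(const uint8_t* src_argb,
                                 uint16_t* dst_ar64,
                                 int width) {
  for (int i = 0; i < width * 4; ++i) {  // 4 channels per pixel
    dst_ar64[i] = (uint16_t)(src_argb[i] * 0x0101);  // e.g. 0xFF -> 0xFFFF
  }
}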

static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};
@@ -3669,6 +3695,7 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) {
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
}

#if LIBYUV_USE_ST3
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
@@ -3692,8 +3719,42 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
      :
      : "cc", "memory", "v0", "v1", "v2");
}
#else
static const uvec8 kYUV24Shuffle[3] =
    {{ 16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20 },
     { 21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27 },
     { 10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15 }};

// AYUV is YVUA in memory. UV for NV12 is UV order in memory.
// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_yuv24,
                         int width) {
  asm volatile(
      "ld1        {v5.16b,v6.16b,v7.16b}, [%4]   \n"  // 3 shuffler constants
      "1:                                        \n"
      "ld1        {v0.16b}, [%0], #16            \n"  // load 16 Y values
      "ld1        {v1.16b}, [%1], #16            \n"  // load 8 VU values
      "tbl        v2.16b, {v0.16b,v1.16b}, v5.16b \n"  // weave into YUV24
      "prfm       pldl1keep, [%0, 448]           \n"
      "tbl        v3.16b, {v0.16b,v1.16b}, v6.16b \n"
      "prfm       pldl1keep, [%1, 448]           \n"
      "tbl        v4.16b, {v0.16b,v1.16b}, v7.16b \n"
      "subs       %w3, %w3, #16                  \n"  // 16 pixels per loop
      "st1        {v2.16b,v3.16b,v4.16b}, [%2], #48 \n"  // store 16 YUV pixels
      "b.gt       1b                             \n"
      : "+r"(src_y),             // %0
        "+r"(src_vu),            // %1
        "+r"(dst_yuv24),         // %2
        "+r"(width)              // %3
      : "r"(&kYUV24Shuffle[0])   // %4
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
#endif  // LIBYUV_USE_ST3

// AYUV is VUYA in memory. UV for NV12 is UV order in memory.
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
                      int src_stride_ayuv,
                      uint8_t* dst_uv,
@@ -3708,8 +3769,8 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
      "uaddlp     v1.8h, v1.16b                  \n"  // U 16 bytes -> 8 shorts.
      "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
      "prfm       pldl1keep, [%1, 448]           \n"
      "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
      "prfm       pldl1keep, [%1, 448]           \n"
      "uqrshrn    v3.8b, v0.8h, #2               \n"  // 2x2 average
      "uqrshrn    v2.8b, v1.8h, #2               \n"
      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
@@ -3737,8 +3798,8 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
      "uaddlp     v1.8h, v1.16b                  \n"  // U 16 bytes -> 8 shorts.
      "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
      "prfm       pldl1keep, [%1, 448]           \n"
      "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
      "prfm       pldl1keep, [%1, 448]           \n"
      "uqrshrn    v0.8b, v0.8h, #2               \n"  // 2x2 average
      "uqrshrn    v1.8b, v1.8h, #2               \n"
      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.