mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-04-30 19:09:18 +08:00
RAWToNV21 using SME, SVE, I8MM or Neon
Pixel 9 Now SVE2 2 pass LibYUVConvertTest.RAWToNV21_Opt (364 ms) 31.76% libyuv::ARGBToUVMatrixRow_SVE_SC() 30.38% RAWToARGBRow_SVE2 26.81% ARGBToYMatrixRow_NEON_DotProd 3.26% MergeUVRow_NEON Was NEON 1 pass LibYUVConvertTest.RAWToJNV21_Opt (295 ms) 44.14% RAWToYJRow_NEON 41.91% RAWToUVJRow_NEON 5.11% MergeUVRow_NEON Clang on Intel Skylake clang [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (301 ms) visual c (row_win) [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (2056 ms) clang [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (275 ms) visual c [ OK ] LibYUVConvertTest.RAWToJNV21_Opt (365 ms) Bug: libyuv:42280902 Change-Id: Iaba558ebe96ce6b9881ee9335ba72b8aac390cde Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7802432 Commit-Queue: Frank Barchard <fbarchard@google.com> Reviewed-by: richard winterton <rrwinterton@gmail.com> Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
This commit is contained in:
parent
b438739c8b
commit
f2ac6db694
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1933
|
||||
Version: 1934
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -140,6 +140,13 @@ extern "C" {
|
||||
|
||||
// The following are available on all x86 platforms, but
|
||||
// require VS2012, clang 3.4 or gcc 4.7.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || defined(__i386__) || \
|
||||
defined(_M_X64) || defined(_M_X86))
|
||||
#define HAS_ARGBTOUVMATRIXROW_AVX2
|
||||
#define HAS_MERGEUVROW_AVX2
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
|
||||
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
|
||||
defined(GCC_HAS_AVX2))
|
||||
@ -163,7 +170,6 @@ extern "C" {
|
||||
#define HAS_I444TORGB24ROW_AVX2
|
||||
#define HAS_INTERPOLATEROW_AVX2
|
||||
#define HAS_J422TOARGBROW_AVX2
|
||||
#define HAS_MERGEUVROW_AVX2
|
||||
#define HAS_MIRRORROW_AVX2
|
||||
#define HAS_NV12TOARGBROW_AVX2
|
||||
#define HAS_NV12TORGB24ROW_AVX2
|
||||
@ -427,6 +433,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOUV444ROW_NEON
|
||||
#define HAS_ARGBTOUVJ444ROW_NEON
|
||||
#define HAS_ARGBTOUVJROW_NEON
|
||||
#define HAS_ARGBTOUVMATRIXROW_NEON
|
||||
#define HAS_ARGBTOUVROW_NEON
|
||||
#define HAS_ARGBTOYJROW_NEON
|
||||
#if !defined(__aarch64__)
|
||||
@ -573,6 +580,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOUV444ROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVJ444ROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVJROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVROW_NEON_I8MM
|
||||
#define HAS_BGRATOUVROW_NEON_I8MM
|
||||
#define HAS_RGBATOUVROW_NEON_I8MM
|
||||
@ -588,6 +596,7 @@ extern "C" {
|
||||
#define HAS_ARGBTORGB565DITHERROW_SVE2
|
||||
#define HAS_ARGBTORGB565ROW_SVE2
|
||||
#define HAS_ARGBTOUVJROW_SVE2
|
||||
#define HAS_ARGBTOUVMATRIXROW_SVE2
|
||||
#define HAS_ARGBTOUVROW_SVE2
|
||||
#define HAS_AYUVTOUVROW_SVE2
|
||||
#define HAS_AYUVTOVUROW_SVE2
|
||||
@ -639,6 +648,7 @@ extern "C" {
|
||||
#define HAS_ABGRTOUVROW_SME
|
||||
#define HAS_ARGBMULTIPLYROW_SME
|
||||
#define HAS_ARGBTOUVJROW_SME
|
||||
#define HAS_ARGBTOUVMATRIXROW_SME
|
||||
#define HAS_ARGBTOUVROW_SME
|
||||
#define HAS_BGRATOUVROW_SME
|
||||
#define HAS_CONVERT16TO8ROW_SME
|
||||
@ -1834,6 +1844,43 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUVMatrixRow_SME(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
|
||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1933
|
||||
#define LIBYUV_VERSION 1934
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -2245,6 +2245,32 @@ ARGBToUVMatrixRow_C;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
|
||||
@ -574,6 +574,32 @@ ARGBToUVMatrixRow_C;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
@ -915,6 +941,32 @@ ARGBToUVMatrixRow_C;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
@ -4065,13 +4117,6 @@ int ARGBToAB64(const uint8_t* src_argb,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Enabled if 1 pass is available
|
||||
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
|
||||
#define HAS_RAWTOYJROW
|
||||
#endif
|
||||
|
||||
// RAW to JNV21 full range NV21
|
||||
LIBYUV_API
|
||||
// Convert RAW to NV21 with Matrix.
|
||||
LIBYUV_API
|
||||
int RAWToNV21Matrix(const uint8_t* src_raw,
|
||||
@ -4226,6 +4271,32 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
|
||||
RAWToARGBRow = RAWToARGBRow_RVV;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
|
||||
@ -2264,6 +2264,12 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
|
||||
memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \
|
||||
}
|
||||
|
||||
#ifdef HAS_ARGBTOUVMATRIXROW_NEON
|
||||
ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
|
||||
ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
|
||||
ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
|
||||
#endif
|
||||
|
||||
@ -1918,6 +1918,72 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
|
||||
// clang-format on
|
||||
|
||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile (
|
||||
"add %1, %0, %1 \n" // src_stride + src_argb
|
||||
"vld1.8 {d18}, [%5] \n" // load kRGBToU
|
||||
"vld1.8 {d19}, [%6] \n" // load kRGBToV
|
||||
"vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17)
|
||||
"vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19)
|
||||
"vdup.16 q10, d16[0] \n" // U0
|
||||
"vdup.16 q11, d16[1] \n" // U1
|
||||
"vdup.16 q12, d16[2] \n" // U2
|
||||
"vdup.16 q13, d18[0] \n" // V0
|
||||
"vdup.16 q14, d18[1] \n" // V1
|
||||
"vdup.16 q15, d18[2] \n" // V2
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
|
||||
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
|
||||
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"vrshr.u16 q0, q0, #2 \n" // average of 4
|
||||
"vrshr.u16 q1, q1, #2 \n"
|
||||
"vrshr.u16 q2, q2, #2 \n"
|
||||
|
||||
"vmov.u16 q3, #0x8000 \n" // 128.0
|
||||
|
||||
"vmul.s16 q8, q0, q10 \n" // U = B * U0
|
||||
"vmla.s16 q8, q1, q11 \n" // U += G * U1
|
||||
"vmla.s16 q8, q2, q12 \n" // U += R * U2
|
||||
|
||||
"vmul.s16 q9, q0, q13 \n" // V = B * V0
|
||||
"vmla.s16 q9, q1, q14 \n" // V += G * V1
|
||||
"vmla.s16 q9, q2, q15 \n" // V += R * V2
|
||||
|
||||
"vsub.u16 q8, q3, q8 \n" // 128.0 - U
|
||||
"vsub.u16 q9, q3, q9 \n" // 128.0 - V
|
||||
|
||||
"vqshrn.u16 d0, q8, #8 \n" // Saturating shift right
|
||||
"vqshrn.u16 d1, q9, #8 \n"
|
||||
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(src_stride_argb), // %1
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(&c->kRGBToU), // %5
|
||||
"r"(&c->kRGBToV) // %6
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/convert_from_argb.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
@ -2893,14 +2894,26 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||
// TODO(fbarchard): consider ptrdiff_t for all strides.
|
||||
|
||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
|
||||
asm volatile (
|
||||
RGBTOUV_SETUP_REG
|
||||
"ldr q16, [%[c], #16] \n" // kRGBToU
|
||||
"ldr q17, [%[c], #32] \n" // kRGBToV
|
||||
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
|
||||
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
|
||||
"dup v20.8h, v16.h[0] \n" // U0 (-BU)
|
||||
"dup v21.8h, v16.h[1] \n" // U1 (-GU)
|
||||
"dup v22.8h, v16.h[2] \n" // U2 (-RU)
|
||||
"dup v23.8h, v17.h[0] \n" // V0 (-BV)
|
||||
"dup v24.8h, v17.h[1] \n" // V1 (-GV)
|
||||
"dup v26.8h, v17.h[2] \n" // V2 (-RV)
|
||||
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
|
||||
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
@ -2909,7 +2922,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
|
||||
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -2919,7 +2932,20 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"urshr v1.8h, v1.8h, #2 \n"
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
// U = B*U0 + G*U1 + R*U2
|
||||
"mul v3.8h, v0.8h, v20.8h \n"
|
||||
"mla v3.8h, v1.8h, v21.8h \n"
|
||||
"mla v3.8h, v2.8h, v22.8h \n"
|
||||
|
||||
// V = B*V0 + G*V1 + R*V2
|
||||
"mul v4.8h, v0.8h, v23.8h \n"
|
||||
"mla v4.8h, v1.8h, v24.8h \n"
|
||||
"mla v4.8h, v2.8h, v26.8h \n"
|
||||
|
||||
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
|
||||
"subhn v0.8b, v25.8h, v3.8h \n"
|
||||
"subhn v1.8b, v25.8h, v4.8h \n"
|
||||
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
@ -2928,12 +2954,21 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: [c] "r"(c) // %5
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25"
|
||||
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
|
||||
);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
&kArgbI601Constants);
|
||||
}
|
||||
|
||||
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
@ -3449,7 +3484,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
}
|
||||
|
||||
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
|
||||
static void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src,
|
||||
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
@ -3546,12 +3581,25 @@ static const int8_t kRGBAToUVCoefficients[] = {
|
||||
0, -112, 74, 38, 0, 18, 94, -112,
|
||||
};
|
||||
|
||||
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
int8_t uvconstants[8] = {
|
||||
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
|
||||
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
uvconstants);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
kARGBToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3560,7 +3608,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
kABGRToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3569,7 +3617,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
|
||||
kBGRAToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3578,7 +3626,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
|
||||
kRGBAToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3606,7 +3654,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
kARGBToUVJCoefficients);
|
||||
}
|
||||
|
||||
@ -3615,7 +3663,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
kABGRToUVJCoefficients);
|
||||
}
|
||||
|
||||
|
||||
@ -1120,6 +1120,20 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
|
||||
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
|
||||
}
|
||||
|
||||
__arm_locally_streaming void ARGBToUVMatrixRow_SME(
|
||||
const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
int8_t uvconstants[8] = {
|
||||
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
|
||||
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
|
||||
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
uvconstants);
|
||||
}
|
||||
|
||||
__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -217,6 +217,19 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
|
||||
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
|
||||
}
|
||||
|
||||
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
int8_t uvconstants[8] = {
|
||||
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
|
||||
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
|
||||
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
uvconstants);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -314,6 +314,95 @@ void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int wi
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
|
||||
LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
|
||||
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
__m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
|
||||
__m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
|
||||
__m256i ymm_0101 = _mm256_set1_epi16(0x0101);
|
||||
__m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
|
||||
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
|
||||
__m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
|
||||
__m256i ymm_zero = _mm256_setzero_si256();
|
||||
|
||||
while (width > 0) {
|
||||
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
|
||||
__m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
|
||||
__m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
|
||||
__m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
|
||||
|
||||
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
|
||||
ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
|
||||
ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
|
||||
ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf);
|
||||
|
||||
ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101);
|
||||
ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101);
|
||||
ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101);
|
||||
ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101);
|
||||
|
||||
ymm0 = _mm256_add_epi16(ymm0, ymm2);
|
||||
ymm1 = _mm256_add_epi16(ymm1, ymm3);
|
||||
|
||||
ymm0 = _mm256_srli_epi16(ymm0, 1);
|
||||
ymm1 = _mm256_srli_epi16(ymm1, 1);
|
||||
ymm0 = _mm256_avg_epu16(ymm0, ymm_zero);
|
||||
ymm1 = _mm256_avg_epu16(ymm1, ymm_zero);
|
||||
|
||||
ymm0 = _mm256_packus_epi16(ymm0, ymm1);
|
||||
ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
|
||||
|
||||
ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v);
|
||||
ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u);
|
||||
|
||||
ymm0 = _mm256_hadd_epi16(ymm0, ymm1);
|
||||
ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
|
||||
ymm0 = _mm256_sub_epi16(ymm_8000, ymm0);
|
||||
ymm0 = _mm256_srli_epi16(ymm0, 8);
|
||||
ymm0 = _mm256_packus_epi16(ymm0, ymm0);
|
||||
|
||||
__m128i xmm_u = _mm256_castsi256_si128(ymm0);
|
||||
__m128i xmm_v = _mm256_extracti128_si256(ymm0, 1);
|
||||
|
||||
_mm_storel_epi64((__m128i*)dst_u, xmm_u);
|
||||
_mm_storel_epi64((__m128i*)dst_v, xmm_v);
|
||||
|
||||
src_argb += 64;
|
||||
dst_u += 8;
|
||||
dst_v += 8;
|
||||
width -= 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_MERGEUVROW_AVX2
|
||||
LIBYUV_TARGET_AVX2
|
||||
void MergeUVRow_AVX2(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
while (width > 0) {
|
||||
__m256i ymm0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
|
||||
__m256i ymm1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
|
||||
|
||||
ymm1 = _mm256_slli_epi16(ymm1, 8);
|
||||
ymm0 = _mm256_or_si256(ymm0, ymm1);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)dst_uv, ymm0);
|
||||
|
||||
src_u += 16;
|
||||
src_v += 16;
|
||||
dst_uv += 32;
|
||||
width -= 16;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user