mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-04-30 19:09:18 +08:00
RAWToNV21 using SME, SVE, I8MM or Neon
The new method uses the Matrix function with a 2-step conversion. The old method only had JNV21, with a 1-step conversion.

Pixel 9, now:
[ OK ] LibYUVConvertTest.RAWToNV21_Opt (364 ms)
  31.76% libyuv::ARGBToUVMatrixRow_SVE_SC()
  30.38% RAWToARGBRow_SVE2
  26.81% ARGBToYMatrixRow_NEON_DotProd
   3.26% MergeUVRow_NEON

Was:
[ OK ] LibYUVConvertTest.RAWToJNV21_Opt (295 ms)
  44.14% RAWToYJRow_NEON
  41.91% RAWToUVJRow_NEON
   5.11% MergeUVRow_NEON

Bug: libyuv:42280902
Change-Id: Iaba558ebe96ce6b9881ee9335ba72b8aac390cde
This commit is contained in:
parent
a7849e8a5e
commit
6fa597023f
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1933
|
||||
Version: 1934
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -427,6 +427,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOUV444ROW_NEON
|
||||
#define HAS_ARGBTOUVJ444ROW_NEON
|
||||
#define HAS_ARGBTOUVJROW_NEON
|
||||
#define HAS_ARGBTOUVMATRIXROW_NEON
|
||||
#define HAS_ARGBTOUVROW_NEON
|
||||
#define HAS_ARGBTOYJROW_NEON
|
||||
#if !defined(__aarch64__)
|
||||
@ -573,6 +574,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOUV444ROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVJ444ROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVJROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
|
||||
#define HAS_ARGBTOUVROW_NEON_I8MM
|
||||
#define HAS_BGRATOUVROW_NEON_I8MM
|
||||
#define HAS_RGBATOUVROW_NEON_I8MM
|
||||
@ -588,6 +590,7 @@ extern "C" {
|
||||
#define HAS_ARGBTORGB565DITHERROW_SVE2
|
||||
#define HAS_ARGBTORGB565ROW_SVE2
|
||||
#define HAS_ARGBTOUVJROW_SVE2
|
||||
#define HAS_ARGBTOUVMATRIXROW_SVE2
|
||||
#define HAS_ARGBTOUVROW_SVE2
|
||||
#define HAS_AYUVTOUVROW_SVE2
|
||||
#define HAS_AYUVTOVUROW_SVE2
|
||||
@ -639,6 +642,7 @@ extern "C" {
|
||||
#define HAS_ABGRTOUVROW_SME
|
||||
#define HAS_ARGBMULTIPLYROW_SME
|
||||
#define HAS_ARGBTOUVJROW_SME
|
||||
#define HAS_ARGBTOUVMATRIXROW_SME
|
||||
#define HAS_ARGBTOUVROW_SME
|
||||
#define HAS_BGRATOUVROW_SME
|
||||
#define HAS_CONVERT16TO8ROW_SME
|
||||
@ -1834,6 +1838,43 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
// ARGBToUVMatrixRow_* family: row kernels that write a U row and a V row from
// source pixels, using a caller-supplied coefficient set `c`.
// All variants share the same parameter list:
//   src_argb / src_stride_argb - source row pointer and a byte stride
//   dst_u, dst_v               - output rows
//   width                      - pixel count
//   c                          - struct ArgbConstants coefficient set
// The _Any_ variants are the ones the dispatch code selects when width is not
// a multiple of 16; SVE2 and SME are declared without _Any_ counterparts.

// Arm Neon kernel.
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// Arm Neon kernel, any-width wrapper.
void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// Arm Neon + I8MM kernel.
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// Arm Neon + I8MM kernel, any-width wrapper.
void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// Arm SVE2 kernel (no _Any_ variant is declared).
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// Arm SME kernel (no _Any_ variant is declared).
void ARGBToUVMatrixRow_SME(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
|
||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1933
|
||||
#define LIBYUV_VERSION 1934
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -2245,6 +2245,32 @@ ARGBToUVMatrixRow_C;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
|
||||
@ -574,6 +574,32 @@ ARGBToUVMatrixRow_C;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
@ -915,6 +941,32 @@ ARGBToUVMatrixRow_C;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
@ -4065,13 +4117,6 @@ int ARGBToAB64(const uint8_t* src_argb,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Enabled if 1 pass is available
|
||||
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
|
||||
#define HAS_RAWTOYJROW
|
||||
#endif
|
||||
|
||||
// RAW to JNV21 full range NV21
|
||||
LIBYUV_API
|
||||
// Convert RAW to NV21 with Matrix.
|
||||
LIBYUV_API
|
||||
int RAWToNV21Matrix(const uint8_t* src_raw,
|
||||
@ -4226,6 +4271,32 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
|
||||
RAWToARGBRow = RAWToARGBRow_RVV;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
|
||||
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
|
||||
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
|
||||
if (TestCpuFlag(kCpuHasSME)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
|
||||
|
||||
@ -2264,6 +2264,12 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
|
||||
memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \
|
||||
}
|
||||
|
||||
#ifdef HAS_ARGBTOUVMATRIXROW_NEON
|
||||
ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
|
||||
ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
|
||||
ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
|
||||
#endif
|
||||
|
||||
@ -1918,6 +1918,72 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
|
||||
// clang-format on
|
||||
|
||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||
// Converts two rows of 4-byte pixels into one row of U and one row of V
// (32-bit Arm Neon).
//
//   src_argb         first source row; the second row is read from
//                    src_argb + src_stride_argb.
//   src_stride_argb  byte offset from the first row to the second row.
//   dst_u, dst_v     output rows; 8 bytes are stored to each per loop pass.
//   width            pixel count. Each pass consumes 16 pixels from each row
//                    and the loop repeats while the remaining count is > 0.
//                    The dispatch code selects this kernel directly only when
//                    width is a multiple of 16.
//   c                coefficient set. 8 bytes of c->kRGBToU and c->kRGBToV
//                    are loaded and sign-extended to 16 bits; elements 0..2
//                    of each are applied to channels 0..2 (B, G, R).
//
// Per output sample: each channel is summed over a 2x2 block, rounding-shifted
// right by 2, then
//   out = (0x8000 - (B * K0 + G * K1 + R * K2)) >> 8
// using 16-bit lanes.
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile (
|
||||
"add %1, %0, %1 \n" // %1 = src_argb + src_stride (second row pointer)
|
||||
"vld1.8 {d18}, [%5] \n" // load 8 bytes of kRGBToU
|
||||
"vld1.8 {d19}, [%6] \n" // load 8 bytes of kRGBToV
|
||||
"vmovl.s8 q8, d18 \n" // sign-extend U coeffs into q8 (d16, d17)
|
||||
"vmovl.s8 q9, d19 \n" // sign-extend V coeffs into q9 (d18, d19); source d19 is overwritten
|
||||
// Broadcast the first three U and V coefficients; q8/q9 are free for reuse
|
// as accumulators inside the loop once these are set up.
"vdup.16 q10, d16[0] \n" // U0
|
||||
"vdup.16 q11, d16[1] \n" // U1
|
||||
"vdup.16 q12, d16[2] \n" // U2
|
||||
"vdup.16 q13, d18[0] \n" // V0
|
||||
"vdup.16 q14, d18[1] \n" // V1
|
||||
"vdup.16 q15, d18[2] \n" // V2
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // row 0: load 8 pixels, de-interleaved by channel.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // row 0: load next 8 pixels.
|
||||
"subs %4, %4, #16 \n" // 16 pixels processed per loop; sets flags for bgt.
|
||||
"vpaddl.u8 q0, q0 \n" // B: add adjacent pairs, 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G: add adjacent pairs, 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R: add adjacent pairs, 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // row 1: load 8 pixels.
|
||||
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // row 1: load next 8 pixels.
|
||||
"vpadal.u8 q0, q4 \n" // B: accumulate row 1 pair sums -> 2x2 sums.
|
||||
"vpadal.u8 q1, q5 \n" // G: accumulate row 1 pair sums -> 2x2 sums.
|
||||
"vpadal.u8 q2, q6 \n" // R: accumulate row 1 pair sums -> 2x2 sums.
|
||||
|
||||
"vrshr.u16 q0, q0, #2 \n" // rounded average of the 4 samples
|
||||
"vrshr.u16 q1, q1, #2 \n"
|
||||
"vrshr.u16 q2, q2, #2 \n"
|
||||
|
||||
"vmov.u16 q3, #0x8000 \n" // 128.0 in 8.8 fixed point (q3 held row 0 channel 3 until here)
|
||||
|
||||
"vmul.s16 q8, q0, q10 \n" // U = B * U0
|
||||
"vmla.s16 q8, q1, q11 \n" // U += G * U1
|
||||
"vmla.s16 q8, q2, q12 \n" // U += R * U2
|
||||
|
||||
"vmul.s16 q9, q0, q13 \n" // V = B * V0
|
||||
"vmla.s16 q9, q1, q14 \n" // V += G * V1
|
||||
"vmla.s16 q9, q2, q15 \n" // V += R * V2
|
||||
|
||||
"vsub.u16 q8, q3, q8 \n" // 128.0 - U
|
||||
"vsub.u16 q9, q3, q9 \n" // 128.0 - V
|
||||
|
||||
"vqshrn.u16 d0, q8, #8 \n" // saturating shift right by 8, narrow to bytes (U)
|
||||
"vqshrn.u16 d1, q9, #8 \n" // same for V
|
||||
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
"bgt 1b \n" // loop while remaining width > 0
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(src_stride_argb), // %1 (becomes the second row pointer)
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
: "r"(&c->kRGBToU), // %5
|
||||
"r"(&c->kRGBToV) // %6
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/convert_from_argb.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
@ -2893,14 +2894,26 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||
// TODO(fbarchard): consider ptrdiff_t for all strides.
|
||||
|
||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
|
||||
asm volatile (
|
||||
RGBTOUV_SETUP_REG
|
||||
"ldr q16, [%[c], #16] \n" // kRGBToU
|
||||
"ldr q17, [%[c], #32] \n" // kRGBToV
|
||||
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
|
||||
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
|
||||
"dup v20.8h, v16.h[0] \n" // U0 (-BU)
|
||||
"dup v21.8h, v16.h[1] \n" // U1 (-GU)
|
||||
"dup v22.8h, v16.h[2] \n" // U2 (-RU)
|
||||
"dup v23.8h, v17.h[0] \n" // V0 (-BV)
|
||||
"dup v24.8h, v17.h[1] \n" // V1 (-GV)
|
||||
"dup v26.8h, v17.h[2] \n" // V2 (-RV)
|
||||
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
|
||||
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
@ -2909,7 +2922,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
|
||||
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -2919,7 +2932,20 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"urshr v1.8h, v1.8h, #2 \n"
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
// U = B*U0 + G*U1 + R*U2
|
||||
"mul v3.8h, v0.8h, v20.8h \n"
|
||||
"mla v3.8h, v1.8h, v21.8h \n"
|
||||
"mla v3.8h, v2.8h, v22.8h \n"
|
||||
|
||||
// V = B*V0 + G*V1 + R*V2
|
||||
"mul v4.8h, v0.8h, v23.8h \n"
|
||||
"mla v4.8h, v1.8h, v24.8h \n"
|
||||
"mla v4.8h, v2.8h, v26.8h \n"
|
||||
|
||||
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
|
||||
"subhn v0.8b, v25.8h, v3.8h \n"
|
||||
"subhn v1.8b, v25.8h, v4.8h \n"
|
||||
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
@ -2928,12 +2954,21 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: [c] "r"(c) // %5
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25"
|
||||
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
|
||||
);
|
||||
}
|
||||
|
||||
// ARGB to U/V row conversion with a fixed coefficient set: forwards all
// arguments unchanged to ARGBToUVMatrixRow_NEON, passing &kArgbI601Constants
// as the matrix.
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
&kArgbI601Constants);
|
||||
}
|
||||
|
||||
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
@ -3449,7 +3484,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
}
|
||||
|
||||
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
|
||||
static void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src,
|
||||
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
@ -3546,12 +3581,25 @@ static const int8_t kRGBAToUVCoefficients[] = {
|
||||
0, -112, 74, 38, 0, 18, 94, -112,
|
||||
};
|
||||
|
||||
// Public matrix entry point for the Neon I8MM U/V kernel.
// Repacks the coefficient set `c` into the flat 8-element int8_t layout taken
// by ARGBToUVMatrixRow_NEON_I8MM_Impl (4 U coefficients followed by
// 4 V coefficients) and forwards the remaining arguments unchanged.
// The fixed-format wrappers in this file pass static tables such as
// kARGBToUVCoefficients to the same _Impl function.
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
// Elements 0..3: kRGBToU[0..3]; elements 4..7: kRGBToV[0..3].
int8_t uvconstants[8] = {
|
||||
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
|
||||
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
uvconstants);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
kARGBToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3560,7 +3608,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
kABGRToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3569,7 +3617,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
|
||||
kBGRAToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3578,7 +3626,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
|
||||
kRGBAToUVCoefficients);
|
||||
}
|
||||
|
||||
@ -3606,7 +3654,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
kARGBToUVJCoefficients);
|
||||
}
|
||||
|
||||
@ -3615,7 +3663,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
kABGRToUVJCoefficients);
|
||||
}
|
||||
|
||||
|
||||
@ -1120,6 +1120,20 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
|
||||
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
|
||||
}
|
||||
|
||||
// SME entry point for the matrix U/V row conversion.
// Repacks the coefficient set `c` into a flat 8-element int8_t array
// (kRGBToU[0..3] followed by kRGBToV[0..3]) and forwards to the shared
// ARGBToUVMatrixRow_SVE_SC kernel with the remaining arguments unchanged.
// NOTE(review): __arm_locally_streaming is presumably what lets the shared
// kernel run in streaming mode here - confirm against the ACLE SME spec.
__arm_locally_streaming void ARGBToUVMatrixRow_SME(
|
||||
const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
// Elements 0..3: kRGBToU[0..3]; elements 4..7: kRGBToV[0..3].
int8_t uvconstants[8] = {
|
||||
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
|
||||
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
|
||||
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
uvconstants);
|
||||
}
|
||||
|
||||
__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
@ -217,6 +217,19 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
|
||||
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
|
||||
}
|
||||
|
||||
// SVE2 entry point for the matrix U/V row conversion.
// Repacks the coefficient set `c` into a flat 8-element int8_t array
// (kRGBToU[0..3] followed by kRGBToV[0..3]) and forwards to the shared
// ARGBToUVMatrixRow_SVE_SC kernel with the remaining arguments unchanged.
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
// Elements 0..3: kRGBToU[0..3]; elements 4..7: kRGBToV[0..3].
int8_t uvconstants[8] = {
|
||||
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
|
||||
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
|
||||
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
uvconstants);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user