[AArch64] Port YUVToRGB color conversions to SME

Some of the color conversion kernels already have Streaming-SVE
implementations; however, many do not. We can re-use the existing SVE
implementations by moving them to a new shared row_sve.h header and marking
them with a "streaming-compatible" attribute to ensure they can be called
from both streaming and non-streaming execution modes.

As part of this move to a common header we also add duplicated
streaming-mode implementations of the following kernels that did not
previously have an SME implementation:

- I210AlphaToARGBRow_SME
- I210ToAR30Row_SME
- I210ToARGBRow_SME
- I212ToAR30Row_SME
- I212ToARGBRow_SME
- I400ToARGBRow_SME
- I410AlphaToARGBRow_SME
- I410ToAR30Row_SME
- I410ToARGBRow_SME
- I422AlphaToARGBRow_SME
- I422ToARGB1555Row_SME
- I422ToARGB4444Row_SME
- I422ToRGB24Row_SME
- I422ToRGB565Row_SME
- I422ToRGBARow_SME
- I444AlphaToARGBRow_SME
- NV12ToARGBRow_SME
- NV12ToRGB24Row_SME
- NV21ToARGBRow_SME
- NV21ToRGB24Row_SME
- P210ToAR30Row_SME
- P210ToARGBRow_SME
- P410ToAR30Row_SME
- P410ToARGBRow_SME
- UYVYToARGBRow_SME
- YUY2ToARGBRow_SME

Change-Id: I84583478e465351cbe6fc0ec65254c3009922e84
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6087804
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-12-05 12:49:07 +00:00 committed by Frank Barchard
parent c2e7f8389a
commit 7fd0bd197e
5 changed files with 2465 additions and 1594 deletions

View File

@ -594,7 +594,23 @@ extern "C" {
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_CONVERT16TO8ROW_SME
#define HAS_COPYROW_SME
#define HAS_I210ALPHATOARGBROW_SME
#define HAS_I210TOAR30ROW_SME
#define HAS_I210TOARGBROW_SME
#define HAS_I212TOAR30ROW_SME
#define HAS_I212TOARGBROW_SME
#define HAS_I400TOARGBROW_SME
#define HAS_I410ALPHATOARGBROW_SME
#define HAS_I410TOAR30ROW_SME
#define HAS_I410TOARGBROW_SME
#define HAS_I422ALPHATOARGBROW_SME
#define HAS_I422TOARGB1555ROW_SME
#define HAS_I422TOARGB4444ROW_SME
#define HAS_I422TOARGBROW_SME
#define HAS_I422TORGB24ROW_SME
#define HAS_I422TORGB565ROW_SME
#define HAS_I422TORGBAROW_SME
#define HAS_I444ALPHATOARGBROW_SME
#define HAS_I444TOARGBROW_SME
#define HAS_INTERPOLATEROW_16_SME
#define HAS_INTERPOLATEROW_16TO8_SME
@ -602,6 +618,15 @@ extern "C" {
#define HAS_MERGEUVROW_16_SME
#define HAS_MERGEUVROW_SME
#define HAS_MULTIPLYROW_16_SME
#define HAS_NV12TOARGBROW_SME
#define HAS_NV12TORGB24ROW_SME
#define HAS_NV21TOARGBROW_SME
#define HAS_NV21TORGB24ROW_SME
#define HAS_P210TOAR30ROW_SME
#define HAS_P210TOARGBROW_SME
#define HAS_P410TOAR30ROW_SME
#define HAS_P410TOARGBROW_SME
// UYVYToARGBRow_SME is declared and its dispatch is guarded by this macro, so
// it must be defined here for the SME kernel to ever be selected.
#define HAS_UYVYTOARGBROW_SME
#define HAS_YUY2TOARGBROW_SME
#endif
// The following are available on AArch64 platforms:
@ -1089,6 +1114,13 @@ void I210AlphaToARGBRow_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I210AlphaToARGBRow_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@ -1103,6 +1135,13 @@ void I410AlphaToARGBRow_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I410AlphaToARGBRow_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1139,6 +1178,12 @@ void I210ToARGBRow_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I210ToARGBRow_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I410ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@ -1151,6 +1196,12 @@ void I410ToARGBRow_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I410ToARGBRow_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I210ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@ -1163,6 +1214,12 @@ void I210ToAR30Row_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I210ToAR30Row_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I410ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@ -1175,6 +1232,12 @@ void I410ToAR30Row_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I410ToAR30Row_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I212ToARGBRow_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@ -1187,6 +1250,12 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I212ToARGBRow_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I212ToAR30Row_NEON(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
@ -1199,6 +1268,12 @@ void I212ToAR30Row_SVE2(const uint16_t* src_y,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I212ToAR30Row_SME(const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* rgb_buf,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1237,6 +1312,13 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444AlphaToARGBRow_SME(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1251,6 +1333,13 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_SME(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1263,6 +1352,12 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y,
uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_SME(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1275,6 +1370,12 @@ void I422ToRGB24Row_SVE2(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_SME(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1287,6 +1388,12 @@ void I422ToRGB565Row_SVE2(const uint8_t* src_y,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB565Row_SME(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1299,6 +1406,12 @@ void I422ToARGB1555Row_SVE2(const uint8_t* src_y,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB1555Row_SME(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -1311,6 +1424,12 @@ void I422ToARGB4444Row_SVE2(const uint8_t* src_y,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGB4444Row_SME(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
@ -1321,6 +1440,11 @@ void NV12ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_SME(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb565,
@ -1336,6 +1460,11 @@ void NV21ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToARGBRow_SME(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
@ -1346,6 +1475,11 @@ void NV12ToRGB24Row_SVE2(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToRGB24Row_SME(const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
@ -1356,6 +1490,11 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToRGB24Row_SME(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
@ -1368,6 +1507,10 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void YUY2ToARGBRow_SME(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
@ -1376,6 +1519,10 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void UYVYToARGBRow_SME(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -4957,6 +5104,10 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_SME(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I400ToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
@ -5511,6 +5662,11 @@ void P210ToARGBRow_SVE2(const uint16_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void P210ToARGBRow_SME(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void P410ToARGBRow_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_argb,
@ -5521,6 +5677,11 @@ void P410ToARGBRow_SVE2(const uint16_t* y_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void P410ToARGBRow_SME(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void P210ToAR30Row_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_ar30,
@ -5531,6 +5692,11 @@ void P210ToAR30Row_SVE2(const uint16_t* y_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void P210ToAR30Row_SME(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void P410ToAR30Row_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_ar30,
@ -5541,6 +5707,11 @@ void P410ToAR30Row_SVE2(const uint16_t* y_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void P410ToAR30Row_SME(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width);
void P210ToARGBRow_Any_NEON(const uint16_t* y_buf,
const uint16_t* uv_buf,
uint8_t* dst_argb,

1772
include/libyuv/row_sve.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -982,6 +982,11 @@ int I010ToAR30Matrix(const uint16_t* src_y,
I210ToAR30Row = I210ToAR30Row_SVE2;
}
#endif
#if defined(HAS_I210TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I210ToAR30Row = I210ToAR30Row_SME;
}
#endif
#if defined(HAS_I210TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
@ -1170,6 +1175,11 @@ int I012ToAR30Matrix(const uint16_t* src_y,
if (TestCpuFlag(kCpuHasSVE2)) {
I212ToAR30Row = I212ToAR30Row_SVE2;
}
#endif
#if defined(HAS_I212TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I212ToAR30Row = I212ToAR30Row_SME;
}
#endif
for (y = 0; y < height; ++y) {
I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
@ -1226,6 +1236,11 @@ int I210ToAR30Matrix(const uint16_t* src_y,
I210ToAR30Row = I210ToAR30Row_SVE2;
}
#endif
#if defined(HAS_I210TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I210ToAR30Row = I210ToAR30Row_SME;
}
#endif
#if defined(HAS_I210TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
@ -1394,6 +1409,11 @@ int I410ToAR30Matrix(const uint16_t* src_y,
I410ToAR30Row = I410ToAR30Row_SVE2;
}
#endif
#if defined(HAS_I410TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410ToAR30Row = I410ToAR30Row_SME;
}
#endif
#if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@ -1469,6 +1489,11 @@ int I010ToARGBMatrix(const uint16_t* src_y,
I210ToARGBRow = I210ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I210TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I210ToARGBRow = I210ToARGBRow_SME;
}
#endif
#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I210ToARGBRow = I210ToARGBRow_Any_AVX2;
@ -1653,6 +1678,11 @@ int I012ToARGBMatrix(const uint16_t* src_y,
if (TestCpuFlag(kCpuHasSVE2)) {
I212ToARGBRow = I212ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I212TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I212ToARGBRow = I212ToARGBRow_SME;
}
#endif
for (y = 0; y < height; ++y) {
I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@ -1715,6 +1745,11 @@ int I210ToARGBMatrix(const uint16_t* src_y,
I210ToARGBRow = I210ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I210TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I210ToARGBRow = I210ToARGBRow_SME;
}
#endif
#if defined(HAS_I210TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I210ToARGBRow = I210ToARGBRow_Any_AVX2;
@ -1889,6 +1924,11 @@ int I410ToARGBMatrix(const uint16_t* src_y,
I410ToARGBRow = I410ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I410TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410ToARGBRow = I410ToARGBRow_SME;
}
#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@ -1959,6 +1999,11 @@ int P010ToARGBMatrix(const uint16_t* src_y,
if (TestCpuFlag(kCpuHasSVE2)) {
P210ToARGBRow = P210ToARGBRow_SVE2;
}
#endif
#if defined(HAS_P210TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P210ToARGBRow = P210ToARGBRow_SME;
}
#endif
for (y = 0; y < height; ++y) {
P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
@ -2023,6 +2068,11 @@ int P210ToARGBMatrix(const uint16_t* src_y,
if (TestCpuFlag(kCpuHasSVE2)) {
P210ToARGBRow = P210ToARGBRow_SVE2;
}
#endif
#if defined(HAS_P210TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P210ToARGBRow = P210ToARGBRow_SME;
}
#endif
for (y = 0; y < height; ++y) {
P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
@ -2085,6 +2135,11 @@ int P010ToAR30Matrix(const uint16_t* src_y,
if (TestCpuFlag(kCpuHasSVE2)) {
P210ToAR30Row = P210ToAR30Row_SVE2;
}
#endif
#if defined(HAS_P210TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P210ToAR30Row = P210ToAR30Row_SME;
}
#endif
for (y = 0; y < height; ++y) {
P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
@ -2149,6 +2204,11 @@ int P210ToAR30Matrix(const uint16_t* src_y,
if (TestCpuFlag(kCpuHasSVE2)) {
P210ToAR30Row = P210ToAR30Row_SVE2;
}
#endif
#if defined(HAS_P210TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P210ToAR30Row = P210ToAR30Row_SME;
}
#endif
for (y = 0; y < height; ++y) {
P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width);
@ -2223,6 +2283,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y,
I422AlphaToARGBRow = I422AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
@ -2387,6 +2452,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y,
I422AlphaToARGBRow = I422AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
@ -2549,6 +2619,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y,
I444AlphaToARGBRow = I444AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I444ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I444AlphaToARGBRow = I444AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I444ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
@ -2808,6 +2883,11 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y,
I210AlphaToARGBRow = I210AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I210ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@ -2943,6 +3023,11 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y,
I210AlphaToARGBRow = I210AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I210ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I210ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3;
@ -3076,6 +3161,11 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y,
I410AlphaToARGBRow = I410AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@ -3219,6 +3309,11 @@ int I400ToARGBMatrix(const uint8_t* src_y,
I400ToARGBRow = I400ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I400TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I400ToARGBRow = I400ToARGBRow_SME;
}
#endif
#if defined(HAS_I400TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I400ToARGBRow = I400ToARGBRow_Any_MSA;
@ -4382,6 +4477,11 @@ int NV12ToARGBMatrix(const uint8_t* src_y,
NV12ToARGBRow = NV12ToARGBRow_SVE2;
}
#endif
#if defined(HAS_NV12TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
NV12ToARGBRow = NV12ToARGBRow_SME;
}
#endif
#if defined(HAS_NV12TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
@ -4477,6 +4577,11 @@ int NV21ToARGBMatrix(const uint8_t* src_y,
NV21ToARGBRow = NV21ToARGBRow_SVE2;
}
#endif
#if defined(HAS_NV21TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
NV21ToARGBRow = NV21ToARGBRow_SME;
}
#endif
#if defined(HAS_NV21TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
@ -4615,6 +4720,11 @@ int NV12ToRGB24Matrix(const uint8_t* src_y,
NV12ToRGB24Row = NV12ToRGB24Row_SVE2;
}
#endif
#if defined(HAS_NV12TORGB24ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
NV12ToRGB24Row = NV12ToRGB24Row_SME;
}
#endif
#if defined(HAS_NV12TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
@ -4686,6 +4796,11 @@ int NV21ToRGB24Matrix(const uint8_t* src_y,
NV21ToRGB24Row = NV21ToRGB24Row_SVE2;
}
#endif
#if defined(HAS_NV21TORGB24ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
NV21ToRGB24Row = NV21ToRGB24Row_SME;
}
#endif
#if defined(HAS_NV21TORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
@ -4890,6 +5005,11 @@ int YUY2ToARGBMatrix(const uint8_t* src_yuy2,
YUY2ToARGBRow = YUY2ToARGBRow_SVE2;
}
#endif
#if defined(HAS_YUY2TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
YUY2ToARGBRow = YUY2ToARGBRow_SME;
}
#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
@ -4983,6 +5103,11 @@ int UYVYToARGBMatrix(const uint8_t* src_uyvy,
UYVYToARGBRow = UYVYToARGBRow_SVE2;
}
#endif
#if defined(HAS_UYVYTOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
UYVYToARGBRow = UYVYToARGBRow_SME;
}
#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
@ -5195,6 +5320,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
I422ToRGBARow = I422ToRGBARow_SVE2;
}
#endif
#if defined(HAS_I422TORGBAROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToRGBARow = I422ToRGBARow_SME;
}
#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;
@ -5428,6 +5558,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
I422ToRGBARow = I422ToRGBARow_SVE2;
}
#endif
#if defined(HAS_I422TORGBAROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToRGBARow = I422ToRGBARow_SME;
}
#endif
#if defined(HAS_I422TORGBAROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGBARow = I422ToRGBARow_Any_MSA;
@ -5563,6 +5698,11 @@ int I420ToRGB24Matrix(const uint8_t* src_y,
I422ToRGB24Row = I422ToRGB24Row_SVE2;
}
#endif
#if defined(HAS_I422TORGB24ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToRGB24Row = I422ToRGB24Row_SME;
}
#endif
#if defined(HAS_I422TORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
@ -5770,6 +5910,11 @@ int I422ToRGB24Matrix(const uint8_t* src_y,
I422ToRGB24Row = I422ToRGB24Row_SVE2;
}
#endif
#if defined(HAS_I422TORGB24ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToRGB24Row = I422ToRGB24Row_SME;
}
#endif
#if defined(HAS_I422TORGB24ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
@ -5902,6 +6047,11 @@ int I420ToARGB1555(const uint8_t* src_y,
I422ToARGB1555Row = I422ToARGB1555Row_SVE2;
}
#endif
#if defined(HAS_I422TOARGB1555ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToARGB1555Row = I422ToARGB1555Row_SME;
}
#endif
#if defined(HAS_I422TOARGB1555ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
@ -5996,6 +6146,11 @@ int I420ToARGB4444(const uint8_t* src_y,
I422ToARGB4444Row = I422ToARGB4444Row_SVE2;
}
#endif
#if defined(HAS_I422TOARGB4444ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToARGB4444Row = I422ToARGB4444Row_SME;
}
#endif
#if defined(HAS_I422TOARGB4444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
@ -6091,6 +6246,11 @@ int I420ToRGB565Matrix(const uint8_t* src_y,
I422ToRGB565Row = I422ToRGB565Row_SVE2;
}
#endif
#if defined(HAS_I422TORGB565ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToRGB565Row = I422ToRGB565Row_SME;
}
#endif
#if defined(HAS_I422TORGB565ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
@ -6236,6 +6396,11 @@ int I422ToRGB565Matrix(const uint8_t* src_y,
I422ToRGB565Row = I422ToRGB565Row_SVE2;
}
#endif
#if defined(HAS_I422TORGB565ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I422ToRGB565Row = I422ToRGB565Row_SME;
}
#endif
#if defined(HAS_I422TORGB565ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
@ -7093,6 +7258,11 @@ static int I010ToAR30MatrixBilinear(const uint16_t* src_y,
I410ToAR30Row = I410ToAR30Row_SVE2;
}
#endif
#if defined(HAS_I410TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410ToAR30Row = I410ToAR30Row_SME;
}
#endif
#if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@ -7212,6 +7382,11 @@ static int I210ToAR30MatrixLinear(const uint16_t* src_y,
I410ToAR30Row = I410ToAR30Row_SVE2;
}
#endif
#if defined(HAS_I410TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410ToAR30Row = I410ToAR30Row_SME;
}
#endif
#if defined(HAS_I410TOAR30ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410ToAR30Row = I410ToAR30Row_Any_SSSE3;
@ -7318,6 +7493,11 @@ static int I010ToARGBMatrixBilinear(const uint16_t* src_y,
I410ToARGBRow = I410ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I410TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410ToARGBRow = I410ToARGBRow_SME;
}
#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@ -7436,6 +7616,11 @@ static int I210ToARGBMatrixLinear(const uint16_t* src_y,
I410ToARGBRow = I410ToARGBRow_SVE2;
}
#endif
#if defined(HAS_I410TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410ToARGBRow = I410ToARGBRow_SME;
}
#endif
#if defined(HAS_I410TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I410ToARGBRow = I410ToARGBRow_Any_AVX2;
@ -7551,6 +7736,11 @@ static int I420AlphaToARGBMatrixBilinear(
I444AlphaToARGBRow = I444AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I444ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I444AlphaToARGBRow = I444AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I444ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
@ -7782,6 +7972,11 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
I444AlphaToARGBRow = I444AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I444ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I444AlphaToARGBRow = I444AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I444ALPHATOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA;
@ -7962,6 +8157,11 @@ static int I010AlphaToARGBMatrixBilinear(
I410AlphaToARGBRow = I410AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@ -8159,6 +8359,11 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
I410AlphaToARGBRow = I410AlphaToARGBRow_SVE2;
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_SME;
}
#endif
#if defined(HAS_I410ALPHATOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3;
@ -8326,6 +8531,11 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y,
P410ToARGBRow = P410ToARGBRow_SVE2;
}
#endif
#if defined(HAS_P410TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P410ToARGBRow = P410ToARGBRow_SME;
}
#endif
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
@ -8432,6 +8642,11 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y,
P410ToARGBRow = P410ToARGBRow_SVE2;
}
#endif
#if defined(HAS_P410TOARGBROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P410ToARGBRow = P410ToARGBRow_SME;
}
#endif
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
@ -8524,6 +8739,11 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y,
P410ToAR30Row = P410ToAR30Row_SVE2;
}
#endif
#if defined(HAS_P410TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P410ToAR30Row = P410ToAR30Row_SME;
}
#endif
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
@ -8630,6 +8850,11 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y,
P410ToAR30Row = P410ToAR30Row_SVE2;
}
#endif
#if defined(HAS_P410TOAR30ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
P410ToAR30Row = P410ToAR30Row_SME;
}
#endif
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {

View File

@ -9,6 +9,7 @@
*/
#include "libyuv/row.h"
#include "libyuv/row_sve.h"
#ifdef __cplusplus
namespace libyuv {
@ -18,32 +19,6 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)
// Load the conversion constants used by the YUV-to-RGB macros. Each load is a
// load-and-replicate governed by predicate p0:
//   z28.b-z31.b <- the four bytes at offsets 0..3 of kUVCoeff
//   z24.h-z27.h <- the four halfwords at byte offsets 0, 2, 4, 6 of
//                  kRGBCoeffBias
// Requires the asm operands %[kUVCoeff] and %[kRGBCoeffBias] and an
// initialised p0.
#define YUVTORGB_SVE_SETUP \
"ld1rb {z28.b}, p0/z, [%[kUVCoeff], #0] \n" \
"ld1rb {z29.b}, p0/z, [%[kUVCoeff], #1] \n" \
"ld1rb {z30.b}, p0/z, [%[kUVCoeff], #2] \n" \
"ld1rb {z31.b}, p0/z, [%[kUVCoeff], #3] \n" \
"ld1rh {z24.h}, p0/z, [%[kRGBCoeffBias], #0] \n" \
"ld1rh {z25.h}, p0/z, [%[kRGBCoeffBias], #2] \n" \
"ld1rh {z26.h}, p0/z, [%[kRGBCoeffBias], #4] \n" \
"ld1rh {z27.h}, p0/z, [%[kRGBCoeffBias], #6] \n"
// Read twice as much data from YUV, putting the even elements from the Y data
// in z0.h and odd elements in z1.h. U/V data is not duplicated, stored in
// z2.h/z3.h.
// All three loads are governed by predicate p1. After loading, src_y is
// advanced with incb and src_u/src_v with inch, and a prefetch is issued ahead
// of each of the three source pointers. The trn2 must run before the trn1
// because trn1 overwrites z0, which trn2 reads.
#define READYUV422_SVE_2X \
"ld1b {z0.b}, p1/z, [%[src_y]] \n" \
"ld1b {z2.h}, p1/z, [%[src_u]] \n" \
"ld1b {z3.h}, p1/z, [%[src_v]] \n" \
"incb %[src_y] \n" \
"inch %[src_u] \n" \
"inch %[src_v] \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n" \
"trn2 z1.b, z0.b, z0.b \n" \
"trn1 z0.b, z0.b, z0.b \n"
// Read twice as much data from YUV, putting the even elements from the Y data
// in z0.h and odd elements in z1.h.
#define READYUV444_SVE_2X \
@ -59,29 +34,6 @@ extern "C" {
"trn2 z1.b, z0.b, z0.b \n" \
"trn1 z0.b, z0.b, z0.b \n"
// The U/V component multiplies do not need to be duplicated in I422, we just
// need to combine them with Y0/Y1 correctly.
// Inputs: Y0/Y1 in z0.h/z1.h, U/V in z2/z3, and the constants in z24-z31 (as
// loaded by YUVTORGB_SVE_SETUP).
// Outputs (16-bit, before narrowing): B0/G0/R0 in z16.h/z17.h/z18.h and
// B1/G1/R1 in z20.h/z21.h/z22.h.
// Temporaries: z4 (DB), z5 (DR) and z6 (DG) are overwritten.
#define I422TORGB_SVE_2X \
"umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
"umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
"umullb z6.h, z30.b, z2.b \n" \
"umullb z4.h, z28.b, z2.b \n" /* DB */ \
"umullb z5.h, z29.b, z3.b \n" /* DR */ \
"umlalb z6.h, z31.b, z3.b \n" /* DG */ \
\
"add z17.h, z0.h, z26.h \n" /* G0 */ \
"add z21.h, z1.h, z26.h \n" /* G1 */ \
"add z16.h, z0.h, z4.h \n" /* B0 */ \
"add z20.h, z1.h, z4.h \n" /* B1 */ \
"add z18.h, z0.h, z5.h \n" /* R0 */ \
"add z22.h, z1.h, z5.h \n" /* R1 */ \
"uqsub z17.h, z17.h, z6.h \n" /* G0 */ \
"uqsub z21.h, z21.h, z6.h \n" /* G1 */ \
"uqsub z16.h, z16.h, z25.h \n" /* B0 */ \
"uqsub z20.h, z20.h, z25.h \n" /* B1 */ \
"uqsub z18.h, z18.h, z27.h \n" /* R0 */ \
"uqsub z22.h, z22.h, z27.h \n" /* R1 */
#define I444TORGB_SVE_2X \
"umulh z0.h, z24.h, z0.h \n" /* Y0 */ \
"umulh z1.h, z24.h, z1.h \n" /* Y1 */ \
@ -115,11 +67,6 @@ extern "C" {
"uqshrnt z17.b, z21.h, #6 \n" /* G1 */ \
"uqshrnt z18.b, z22.h, #6 \n" /* R1 */
// Vector and predicate registers modified by the YUV-to-RGB asm macros; expand
// this in the clobber list of every inline asm statement that uses them.
// z21 is included because the 2X conversion macros write G1 to z21.h, so the
// compiler must not keep a live value there across the asm statement.
#define YUVTORGB_SVE_REGS                                                     \
  "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", \
      "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29",   \
      "z30", "z31", "p0", "p1", "p2", "p3"
__arm_locally_streaming void I444ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
@ -168,6 +115,15 @@ __arm_locally_streaming void I444ToARGBRow_SME(
: "cc", "memory", YUVTORGB_SVE_REGS);
}
// SME variant of I400ToARGBRow. Forwards its arguments unchanged to the shared
// implementation I400ToARGBRow_SVE_SC.
__arm_locally_streaming void I400ToARGBRow_SME(
const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
I400ToARGBRow_SVE_SC(src_y, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void I422ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
@ -176,44 +132,255 @@ __arm_locally_streaming void I422ToARGBRow_SME(
const struct YuvConstants* yuvconstants,
int width) {
// Streaming-SVE only, no use of ZA tile.
uint64_t vl;
asm volatile(
"cntb %[vl] \n"
"ptrue p0.b \n" //
YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // A0
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
I422ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.b \n"
"1: \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
"subs %w[width], %w[width], %w[vl] \n"
"st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
"incb %[dst_argb], all, mul #4 \n"
"b.ge 1b \n"
// SME entry point for the I422ToRGB24 row kernel. __arm_locally_streaming (Arm
// ACLE) makes the body execute in streaming mode; all work is delegated to
// I422ToRGB24Row_SVE_SC with the arguments passed through unchanged.
// NOTE(review): the destination parameter is called dst_argb although the
// kernel name says RGB24 (the NV12/NV21 RGB24 wrappers use dst_rgb24).
__arm_locally_streaming void I422ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGB24Row_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
"2: \n"
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// SME entry point for the I422ToRGB565 row kernel. The function is marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode) and simply
// forwards its three 8-bit source pointers, the destination pointer, the
// constants pointer and width to I422ToRGB565Row_SVE_SC.
__arm_locally_streaming void I422ToRGB565Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGB565Row_SVE_SC(src_y, src_u, src_v, dst_rgb565, yuvconstants, width);
}
// Calculate a predicate for the final iteration to deal with the tail.
"whilelt p1.b, wzr, %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_2X
"st4b {z16.b, z17.b, z18.b, z19.b}, p1, [%[dst_argb]] \n"
// SME entry point for the I422ToARGB1555 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode); no work is
// done here beyond forwarding every argument, in order, to
// I422ToARGB1555Row_SVE_SC.
__arm_locally_streaming void I422ToARGB1555Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
I422ToARGB1555Row_SVE_SC(src_y, src_u, src_v, dst_argb1555, yuvconstants,
width);
}
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_argb] "+r"(dst_argb), // %[dst_argb]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_SVE_REGS);
// SME entry point for the I422ToARGB4444 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode); the
// function only forwards its arguments, unchanged and in order, to
// I422ToARGB4444Row_SVE_SC.
__arm_locally_streaming void I422ToARGB4444Row_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
I422ToARGB4444Row_SVE_SC(src_y, src_u, src_v, dst_argb4444, yuvconstants,
width);
}
// SME entry point for the I422ToRGBA row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode); delegates
// to I422ToRGBARow_SVE_SC with all six arguments passed through unchanged.
// NOTE(review): the destination parameter is named dst_argb even though the
// kernel name says RGBA - confirm the name is intentional.
__arm_locally_streaming void I422ToRGBARow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I422ToRGBARow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
// SME entry point for the I422AlphaToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). Takes an
// extra src_a pointer compared with the non-alpha I422 wrappers and forwards
// it, together with every other argument, to I422AlphaToARGBRow_SVE_SC.
__arm_locally_streaming void I422AlphaToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I422AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
// SME entry point for the I444AlphaToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). All seven
// arguments, including the src_a pointer, are handed unchanged to
// I444AlphaToARGBRow_SVE_SC, which does the actual conversion.
__arm_locally_streaming void I444AlphaToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I444AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
// SME entry point for the NV12ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode); forwards
// the src_y and src_uv pointers, destination, constants and width unchanged
// to NV12ToARGBRow_SVE_SC.
__arm_locally_streaming void NV12ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
NV12ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
// SME entry point for the NV21ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode); forwards
// the src_y and src_vu pointers, destination, constants and width unchanged
// to NV21ToARGBRow_SVE_SC.
__arm_locally_streaming void NV21ToARGBRow_SME(
const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
NV21ToARGBRow_SVE_SC(src_y, src_vu, dst_argb, yuvconstants, width);
}
// SME entry point for the NV12ToRGB24 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode); the body is
// a single call that passes all five arguments unchanged to
// NV12ToRGB24Row_SVE_SC.
__arm_locally_streaming void NV12ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_uv,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
NV12ToRGB24Row_SVE_SC(src_y, src_uv, dst_rgb24, yuvconstants, width);
}
// SME entry point for the NV21ToRGB24 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode); the body is
// a single call that passes all five arguments unchanged to
// NV21ToRGB24Row_SVE_SC.
__arm_locally_streaming void NV21ToRGB24Row_SME(
const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
// SME entry point for the YUY2ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). Takes a
// single source pointer (src_yuy2) and forwards it with the destination,
// constants and width to YUY2ToARGBRow_SVE_SC.
__arm_locally_streaming void YUY2ToARGBRow_SME(
const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
YUY2ToARGBRow_SVE_SC(src_yuy2, dst_argb, yuvconstants, width);
}
// SME entry point for the UYVYToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). Takes a
// single source pointer (src_uyvy) and forwards it with the destination,
// constants and width to UYVYToARGBRow_SVE_SC.
__arm_locally_streaming void UYVYToARGBRow_SME(
const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
UYVYToARGBRow_SVE_SC(src_uyvy, dst_argb, yuvconstants, width);
}
// SME entry point for the I210ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The three
// source pointers are uint16_t*, the destination is uint8_t*; everything is
// forwarded unchanged to I210ToARGBRow_SVE_SC.
__arm_locally_streaming void I210ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I210ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
// SME entry point for the I210AlphaToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). All four
// source pointers (including src_a) are uint16_t*; they are forwarded with
// the uint8_t* destination, constants and width to I210AlphaToARGBRow_SVE_SC.
__arm_locally_streaming void I210AlphaToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I210AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
// SME entry point for the I210ToAR30 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). Accepts
// three uint16_t* sources and a uint8_t* destination (dst_ar30) and forwards
// all arguments unchanged to I210ToAR30Row_SVE_SC.
__arm_locally_streaming void I210ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I210ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
// SME entry point for the P210ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). Accepts
// two uint16_t* sources (src_y, src_uv) and forwards them with the
// destination, constants and width to P210ToARGBRow_SVE_SC.
__arm_locally_streaming void P210ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
P210ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
// SME entry point for the P210ToAR30 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). Accepts
// two uint16_t* sources (src_y, src_uv) and forwards them with dst_ar30,
// the constants pointer and width to P210ToAR30Row_SVE_SC.
__arm_locally_streaming void P210ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
P210ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}
// SME entry point for the I410ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The three
// uint16_t* sources, the uint8_t* destination, the constants pointer and
// width are passed straight through to I410ToARGBRow_SVE_SC.
__arm_locally_streaming void I410ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I410ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
// SME entry point for the I410AlphaToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The four
// uint16_t* sources (including src_a) and the remaining arguments are passed
// straight through to I410AlphaToARGBRow_SVE_SC.
__arm_locally_streaming void I410AlphaToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
const uint16_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I410AlphaToARGBRow_SVE_SC(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
width);
}
// SME entry point for the I410ToAR30 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The three
// uint16_t* sources, dst_ar30, the constants pointer and width are passed
// straight through to I410ToAR30Row_SVE_SC.
__arm_locally_streaming void I410ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I410ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
// SME entry point for the P410ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The two
// uint16_t* sources (src_y, src_uv) and the remaining arguments are handed
// unchanged to P410ToARGBRow_SVE_SC.
__arm_locally_streaming void P410ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
P410ToARGBRow_SVE_SC(src_y, src_uv, dst_argb, yuvconstants, width);
}
// SME entry point for the P410ToAR30 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The two
// uint16_t* sources (src_y, src_uv), dst_ar30, the constants pointer and
// width are handed unchanged to P410ToAR30Row_SVE_SC.
__arm_locally_streaming void P410ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_uv,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
P410ToAR30Row_SVE_SC(src_y, src_uv, dst_ar30, yuvconstants, width);
}
// SME entry point for the I212ToAR30 row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The body
// is one call: the three uint16_t* sources, dst_ar30, the constants pointer
// and width go unchanged to I212ToAR30Row_SVE_SC.
__arm_locally_streaming void I212ToAR30Row_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_ar30,
const struct YuvConstants* yuvconstants,
int width) {
I212ToAR30Row_SVE_SC(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
}
// SME entry point for the I212ToARGB row kernel. Marked
// __arm_locally_streaming (Arm ACLE: body runs in streaming mode). The body
// is one call: the three uint16_t* sources, the uint8_t* destination, the
// constants pointer and width go unchanged to I212ToARGBRow_SVE_SC.
__arm_locally_streaming void I212ToARGBRow_SME(
const uint16_t* src_y,
const uint16_t* src_u,
const uint16_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
I212ToARGBRow_SVE_SC(src_y, src_u, src_v, dst_argb, yuvconstants, width);
}
__arm_locally_streaming void MultiplyRow_16_SME(const uint16_t* src_y,

File diff suppressed because it is too large Load Diff