[AArch64] Add I8MM implementation of ARGBToUV444Row

We cannot use the standard dot-product instructions since the
coefficient multiplication results are both added and subtracted, but
I8MM supports mixed-sign dot products, which work well here.  We need to
add an additional variant of the coefficient structs since we need
negative constants for the elements that were previously subtracted.

Reduction in runtimes observed compared to the previous Neon
implementation:

Cortex-A510: -37.3%
Cortex-A520: -31.1%
Cortex-A715: -37.1%
Cortex-A720: -37.0%
  Cortex-X2: -62.1%
  Cortex-X3: -62.2%
  Cortex-X4: -40.4%

Bug: libyuv:977
Change-Id: Idc3d9a6408c30e1bce3816a1ed926ecd76792236
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5712928
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2024-04-18 16:18:10 +01:00 committed by Frank Barchard
parent 302d29d1a8
commit a4ccf9940e
4 changed files with 79 additions and 6 deletions

View File

@ -578,6 +578,7 @@ extern "C" {
#define HAS_RGBATOYROW_NEON_DOTPROD
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUV444ROW_NEON_I8MM
#endif
// The following are available on AArch64 SVE platforms:
@ -1611,6 +1612,10 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
// I8MM variant of ARGBToUV444Row_NEON: writes one U byte and one V byte per
// source pixel. The kernel consumes 8 pixels per loop iteration, and the
// dispatcher selects it directly only when width is a multiple of 8.
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -2118,6 +2123,10 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
// "Any" wrapper around ARGBToUV444Row_NEON_I8MM, generated by ANY12. The
// dispatcher uses it when width is not a multiple of 8.
// NOTE(review): remainder handling lives in the ANY12 macro, which is not
// visible here - confirm there before relying on details.
void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,

View File

@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM;
if (IS_ALIGNED(width, 8)) {
ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUV444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
ARGBToUV444Row = ARGBToUV444Row_Any_MSA;

View File

@ -2116,6 +2116,9 @@ ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOUV444ROW_NEON_I8MM
// Defines ARGBToUV444Row_Any_NEON_I8MM on top of ARGBToUV444Row_NEON_I8MM.
// The ANY12 arguments are the same as for the plain NEON instantiation.
// NOTE(review): the trailing 7 presumably is the width mask matching the
// kernel's 8-pixel step - confirm against the ANY12 definition.
ANY12(ARGBToUV444Row_Any_NEON_I8MM, ARGBToUV444Row_NEON_I8MM, 0, 4, 0, 7)
#endif
#ifdef HAS_YUY2TOUV422ROW_MSA
ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)

View File

@ -2711,17 +2711,22 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
);
}
struct RgbUVConstants {
struct RgbUVConstantsU8 {
uint8_t kRGBToU[4];
uint8_t kRGBToV[4];
};
// Signed 8-bit RGB-to-UV coefficients for the I8MM (mixed-sign dot product)
// path. ARGBToUV444MatrixRow_NEON_I8MM loads this struct as two consecutive
// 32-bit words, so each array must stay exactly 4 bytes and in this order.
struct RgbUVConstantsI8 {
int8_t kRGBToU[4];  // U coefficients, one per byte of a 4-byte pixel.
int8_t kRGBToV[4];  // V coefficients, one per byte of a 4-byte pixel.
};
// 8x1 pixels.
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
const struct RgbUVConstantsU8* rgbuvconstants) {
asm volatile(
"ldr d0, [%4] \n" // load rgbuvconstants
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
@ -2758,6 +2763,42 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
"v27", "v28", "v29");
}
void ARGBToUV444MatrixRow_NEON_I8MM(
const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstantsI8* rgbuvconstants) {
asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
"movi v29.16b, #0x80 \n" // 128.5
"1: \n"
"ldp q0, q1, [%[src]], #32 \n"
"movi v2.4s, #0 \n"
"movi v3.4s, #0 \n"
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"
"usdot v2.4s, v0.16b, v16.16b \n"
"usdot v3.4s, v1.16b, v16.16b \n"
"usdot v4.4s, v0.16b, v17.16b \n"
"usdot v5.4s, v1.16b, v17.16b \n"
"prfm pldl1keep, [%[src], 448] \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
"uzp1 v0.8h, v2.8h, v3.8h \n"
"uzp1 v1.8h, v4.8h, v5.8h \n"
"addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned
"addhn v1.8b, v1.8h, v29.8h \n" // +128 -> unsigned
"str d0, [%[dst_u]], #8 \n" // store 8 pixels U.
"str d1, [%[dst_v]], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: [src] "+r"(src_argb), // %[src]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
"v29");
}
// RGB to bt601 coefficients
// UB 0.875 coefficient = 112
// UG -0.5781 coefficient = 74
@ -2766,15 +2807,27 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
// VG -0.7344 coefficient = 94
// VR 0.875 coefficient = 112 (ignored)
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
static const struct RgbUVConstantsU8 kRgb24I601UVConstantsU8 = {
{112, 74, 38, 0},
{18, 94, 112, 0}};
// Signed BT.601 coefficients for the I8MM path. Same magnitudes as
// kRgb24I601UVConstantsU8, with the terms that are subtracted in the
// unsigned path stored as negative values here.
static const struct RgbUVConstantsI8 kRgb24I601UVConstantsI8 = {
{112, -74, -38, 0},
{-18, -94, 112, 0}};
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
&kRgb24I601UVConstants);
&kRgb24I601UVConstantsU8);
}
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
&kRgb24I601UVConstantsI8);
}
#define RGBTOUV_SETUP_REG \