[AArch64] Add SVE2 implementation of I422ToARGB4444Row

This uses the same approach as the Neon code: rather than performing
redundant narrowing and then widening shifts, the values are placed in the
top portion of the lanes and shifted down from there.

Observed reduction in runtime compared to the existing Neon code:

Cortex-A510: -35.5%
Cortex-A520: -38.2%
Cortex-A715: -19.8%
Cortex-A720: -19.8%
  Cortex-X2: -24.2%
  Cortex-X3: -24.1%
  Cortex-X4: -21.6%
Cortex-X925: -19.5%

Bug: b/42280942
Change-Id: I0a916600e7bdee0f5480ea843b44ab046bb3d082
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5802968
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-05-05 19:49:24 +01:00 committed by Frank Barchard
parent f4eaeca22a
commit 22ac86800e
3 changed files with 68 additions and 0 deletions

View File

@ -551,6 +551,7 @@ extern "C" {
#define HAS_I400TOARGBROW_SVE2 #define HAS_I400TOARGBROW_SVE2
#define HAS_I422ALPHATOARGBROW_SVE2 #define HAS_I422ALPHATOARGBROW_SVE2
#define HAS_I422TOARGB1555ROW_SVE2 #define HAS_I422TOARGB1555ROW_SVE2
#define HAS_I422TOARGB4444ROW_SVE2
#define HAS_I422TOARGBROW_SVE2 #define HAS_I422TOARGBROW_SVE2
#define HAS_I422TORGB24ROW_SVE2 #define HAS_I422TORGB24ROW_SVE2
#define HAS_I422TORGB565ROW_SVE2 #define HAS_I422TORGB565ROW_SVE2
@ -1207,6 +1208,12 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
uint8_t* dst_argb4444, uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// SVE2 row function taking separate Y, U and V source pointers, a
// destination pointer for the ARGB4444 output, the YUV conversion constants
// and the row width. Same parameter list as I422ToARGB4444Row_NEON.
void I422ToARGB4444Row_SVE2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_NEON(const uint8_t* src_y, void NV12ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_uv, const uint8_t* src_uv,
uint8_t* dst_argb, uint8_t* dst_argb,

View File

@ -5795,6 +5795,11 @@ int I420ToARGB4444(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I422TOARGB4444ROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
I422ToARGB4444Row = I422ToARGB4444Row_SVE2;
}
#endif
#if defined(HAS_I422TOARGB4444ROW_MSA) #if defined(HAS_I422TOARGB4444ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) { if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;

View File

@ -495,6 +495,62 @@ void I422ToARGB1555Row_SVE2(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_SVE_REGS); : "cc", "memory", YUVTORGB_SVE_REGS);
} }
// Assembly fragment that packs two vectors' worth of pixels into ARGB4444,
// leaving the results in z0 and z1 (16 bits per pixel, one pixel per .h lane).
// Each lane starts as 0xf000 (alpha nibble all ones); the three SRI
// (shift-right-and-insert) steps then drop the top bits of the red, green and
// blue inputs into place beneath the bits already set, as shown by the
// per-line bit patterns. Per those patterns, z18/z22 supply red, z17/z21
// green and z16/z20 blue.
// NOTE(review): assumes the inputs hold each channel in the top bits of its
// 16-bit lane (as produced by RGBTOARGB8_SVE_TOP_2X) - confirm against that
// macro's definition.
#define RGB8TOARGB4444_SVE_FROM_TOP_2X \
"dup z0.h, #0xf000 \n" /* 1111000000000000 */ \
"dup z1.h, #0xf000 \n" /* 1111000000000000 */ \
"sri z0.h, z18.h, #4 \n" /* 1111rrrrxxxxxxxx */ \
"sri z1.h, z22.h, #4 \n" /* 1111rrrrxxxxxxxx */ \
"sri z0.h, z17.h, #8 \n" /* 1111rrrrggggxxxx */ \
"sri z1.h, z21.h, #8 \n" /* 1111rrrrggggxxxx */ \
"sri z0.h, z16.h, #12 \n" /* 1111rrrrggggbbbb */ \
"sri z1.h, z20.h, #12 \n" /* 1111rrrrggggbbbb */
// Converts one row from separate src_y / src_u / src_v planes to ARGB4444
// (16 bits per pixel) at dst_argb4444 using SVE2 inline assembly.
//
// The colour conversion is driven by yuvconstants->kUVCoeff and
// yuvconstants->kRGBCoeffBias. The main loop consumes `cntb` (bytes per SVE
// vector) pixels per iteration under an all-true predicate; whatever is left
// over is handled by a single predicated tail pass.
//
// NOTE(review): the register usage and source-pointer advancement of
// YUVTORGB_SVE_SETUP, READYUV422_SVE_2X, I422TORGB_SVE_2X and
// RGBTOARGB8_SVE_TOP_2X are defined elsewhere - confirm against those macros.
void I422ToARGB4444Row_SVE2(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
uint64_t vl;
asm volatile(
// vl = number of bytes in one SVE vector, used as the per-iteration step.
"cntb %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
// Pre-decrement width; if fewer than vl pixels exist, go straight to the
// tail at label 2.
"subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n"
// Run bulk of computation with an all-true predicate to avoid predicate
// generation overhead.
"ptrue p1.b \n"
"1: \n" READYUV422_SVE_2X
I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
// Update the remaining count early; the resulting flags are consumed by
// the b.ge at the bottom of the loop.
"subs %w[width], %w[width], %w[vl] \n" //
RGB8TOARGB4444_SVE_FROM_TOP_2X
// Store the packed pixels from z0/z1 interleaved as halfwords, then
// advance dst by two vectors' worth of bytes.
"st2h {z0.h, z1.h}, p1, [%[dst]] \n"
"incb %[dst], all, mul #2 \n"
"b.ge 1b \n"
"2: \n"
// Undo the last subtraction to recover the leftover pixel count; if it
// is zero there is no tail to process.
"adds %w[width], %w[width], %w[vl] \n"
"b.eq 99f \n"
// Calculate a predicate for the final iteration to deal with the tail.
// vl is reloaded with the number of halfword lanes per vector.
// NOTE(review): presumably consumed by one of the macros expanded below -
// confirm.
"cnth %[vl] \n"
// p1 has its first `width` byte lanes active.
"whilelt p1.b, wzr, %w[width] \n" READYUV422_SVE_2X
I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
"st2h {z0.h, z1.h}, p1, [%[dst]] \n"
"99: \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst] "+r"(dst_argb4444), // %[dst]
[width] "+r"(width), // %[width]
[vl] "=&r"(vl) // %[vl]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_SVE_REGS);
}
void I422ToRGBARow_SVE2(const uint8_t* src_y, void I422ToRGBARow_SVE2(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,