mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
[AArch64] Add SVE2 implementation of I422ToRGB565Row
This makes use of the same approach as the Neon code: rather than performing redundant narrowing and then widening shifts, the values are placed at the top portion of the lanes and shifted down from there. Observed reduction in runtime compared to the existing Neon code: Cortex-A510: -41.1%, Cortex-A520: -38.2%, Cortex-A715: -21.5%, Cortex-A720: -21.6%, Cortex-X2: -21.6%, Cortex-X3: -22.0%, Cortex-X4: -23.5%, Cortex-X925: -21.7%. Bug: b/42280942 Change-Id: Id84872141435566bbf94a4bbf0227554b5b5fb91 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5802966 Reviewed-by: Justin Green <greenjustin@google.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
4621b0cc7f
commit
f40042533c
@ -552,6 +552,7 @@ extern "C" {
|
|||||||
#define HAS_I422ALPHATOARGBROW_SVE2
|
#define HAS_I422ALPHATOARGBROW_SVE2
|
||||||
#define HAS_I422TOARGBROW_SVE2
|
#define HAS_I422TOARGBROW_SVE2
|
||||||
#define HAS_I422TORGB24ROW_SVE2
|
#define HAS_I422TORGB24ROW_SVE2
|
||||||
|
#define HAS_I422TORGB565ROW_SVE2
|
||||||
#define HAS_I422TORGBAROW_SVE2
|
#define HAS_I422TORGBAROW_SVE2
|
||||||
#define HAS_I444ALPHATOARGBROW_SVE2
|
#define HAS_I444ALPHATOARGBROW_SVE2
|
||||||
#define HAS_I444TOARGBROW_SVE2
|
#define HAS_I444TOARGBROW_SVE2
|
||||||
@ -1181,6 +1182,12 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
|
|||||||
uint8_t* dst_rgb565,
|
uint8_t* dst_rgb565,
|
||||||
const struct YuvConstants* yuvconstants,
|
const struct YuvConstants* yuvconstants,
|
||||||
int width);
|
int width);
|
||||||
|
// SVE2 counterpart of I422ToRGB565Row_NEON: takes one row each of Y, U and V
// input, a destination RGB565 row, the YUV conversion constants and the row
// width.
// NOTE(review): width is presumably counted in pixels -- confirm against the
// definition.
void I422ToRGB565Row_SVE2(const uint8_t* src_y,
|
||||||
|
const uint8_t* src_u,
|
||||||
|
const uint8_t* src_v,
|
||||||
|
uint8_t* dst_rgb565,
|
||||||
|
const struct YuvConstants* yuvconstants,
|
||||||
|
int width);
|
||||||
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
|
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
|
||||||
const uint8_t* src_u,
|
const uint8_t* src_u,
|
||||||
const uint8_t* src_v,
|
const uint8_t* src_v,
|
||||||
|
|||||||
@ -5880,6 +5880,11 @@ int I420ToRGB565Matrix(const uint8_t* src_y,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_I422TORGB565ROW_SVE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||||
|
I422ToRGB565Row = I422ToRGB565Row_SVE2;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_I422TORGB565ROW_MSA)
|
#if defined(HAS_I422TORGB565ROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
|
I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
|
||||||
@ -6020,6 +6025,11 @@ int I422ToRGB565Matrix(const uint8_t* src_y,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_I422TORGB565ROW_SVE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSVE2)) {
|
||||||
|
I422ToRGB565Row = I422ToRGB565Row_SVE2;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_I422TORGB565ROW_MSA)
|
#if defined(HAS_I422TORGB565ROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
|
I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
|
||||||
|
|||||||
@ -181,6 +181,15 @@ extern "C" {
|
|||||||
"uqshrnt z17.b, z21.h, #6 \n" /* G1 */ \
|
"uqshrnt z17.b, z21.h, #6 \n" /* G1 */ \
|
||||||
"uqshrnt z18.b, z22.h, #6 \n" /* R1 */
|
"uqshrnt z18.b, z22.h, #6 \n" /* R1 */
|
||||||
|
|
||||||
|
// Shift each 16-bit B/G/R lane left by two bits with unsigned saturation, in
// place and under predicate p0, so the most significant colour bits end up at
// the top of every .h lane. No narrowing to 8 bits happens here; a consumer
// is expected to shift down from the top of the lane instead.
// Two sets of vectors are processed: z16/z17/z18 (B0/G0/R0) and z20/z21/z22
// (B1/G1/R1). The "Inputs" comment below only names the first set.
// NOTE(review): inputs are presumably the 2.14 fixed-point values described
// for RGBTORGBA8_SVE -- confirm.
#define RGBTOARGB8_SVE_TOP_2X \
|
||||||
|
/* Inputs: B: z16.h, G: z17.h, R: z18.h */ \
|
||||||
|
"uqshl z16.h, p0/m, z16.h, #2 \n" /* B0 */ \
|
||||||
|
"uqshl z17.h, p0/m, z17.h, #2 \n" /* G0 */ \
|
||||||
|
"uqshl z18.h, p0/m, z18.h, #2 \n" /* R0 */ \
|
||||||
|
"uqshl z20.h, p0/m, z20.h, #2 \n" /* B1 */ \
|
||||||
|
"uqshl z21.h, p0/m, z21.h, #2 \n" /* G1 */ \
|
||||||
|
"uqshl z22.h, p0/m, z22.h, #2 \n" /* R1 */
|
||||||
|
|
||||||
// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR
|
// Convert from 2.14 fixed point RGB to 8 bit RGBA, interleaving as AB and GR
|
||||||
// pairs to allow us to use ST2 for storing rather than ST4.
|
// pairs to allow us to use ST2 for storing rather than ST4.
|
||||||
#define RGBTORGBA8_SVE \
|
#define RGBTORGBA8_SVE \
|
||||||
@ -377,6 +386,59 @@ void I422ToRGB24Row_SVE2(const uint8_t* src_y,
|
|||||||
: "cc", "memory", YUVTORGB_SVE_REGS);
|
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pack two sets of top-aligned 16-bit R/G/B lanes into RGB565 using
// shift-right-and-insert (SRI):
//   - G (z17/z21) is shifted right by 5 and inserted into R (z18/z22),
//     leaving the top 5 bits of R untouched;
//   - B (z16/z20) is shifted right by 11 and inserted, leaving the top 11
//     bits (R and G) untouched.
// Results are left in z18 (first set) and z22 (second set); z22 is then
// copied to z19 so that z18/z19 form the register pair used by the
// "st2h {z18.h, z19.h}" stores in I422ToRGB565Row_SVE2.
// Overwrites z18, z19 and z22.
#define RGB8TORGB565_SVE_FROM_TOP_2X \
|
||||||
|
"sri z18.h, z17.h, #5 \n" /* rrrrrgggggg00000 */ \
|
||||||
|
"sri z22.h, z21.h, #5 \n" /* rrrrrgggggg00000 */ \
|
||||||
|
"sri z18.h, z16.h, #11 \n" /* rrrrrggggggbbbbb */ \
|
||||||
|
"sri z22.h, z20.h, #11 \n" /* rrrrrggggggbbbbb */ \
|
||||||
|
"mov z19.d, z22.d \n"
|
||||||
|
|
||||||
|
// Convert one row of I422 (separate Y, U and V inputs) to RGB565 using SVE2.
//
// src_y, src_u, src_v: input rows, passed to the asm as read/write register
//                      operands.
// dst_rgb565:          output row; advanced by two vectors of bytes per
//                      main-loop iteration.
// yuvconstants:        supplies kUVCoeff and kRGBCoeffBias to the asm.
// width:               remaining element count; decremented by the vector
//                      length in bytes on every main-loop iteration.
//
// Structure: a main loop that runs with an all-true predicate while at least
// one full vector of bytes remains, followed by a single predicated tail
// iteration for any leftover.
// NOTE(review): the read/convert macros (READYUV422_SVE_2X, I422TORGB_SVE_2X,
// YUVTORGB_SVE_SETUP) are defined outside this view; they presumably advance
// the source pointers and consume p1 and %[vl] -- confirm.
void I422ToRGB565Row_SVE2(const uint8_t* src_y,
|
||||||
|
const uint8_t* src_u,
|
||||||
|
const uint8_t* src_v,
|
||||||
|
uint8_t* dst_rgb565,
|
||||||
|
const struct YuvConstants* yuvconstants,
|
||||||
|
int width) {
|
||||||
|
uint64_t vl;
|
||||||
|
asm volatile(
|
||||||
|
// vl = vector length in bytes.
"cntb %[vl] \n"
|
||||||
|
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||||
|
// Skip the main loop entirely when less than one full vector remains.
"subs %w[width], %w[width], %w[vl] \n"
|
||||||
|
"b.lt 2f \n"
|
||||||
|
|
||||||
|
// Run bulk of computation with an all-true predicate to avoid predicate
|
||||||
|
// generation overhead.
|
||||||
|
"ptrue p1.b \n"
|
||||||
|
"1: \n" READYUV422_SVE_2X
|
||||||
|
I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
|
||||||
|
// The flags set by this subs are consumed by the b.ge at the end of the
// loop body.
"subs %w[width], %w[width], %w[vl] \n" //
|
||||||
|
RGB8TORGB565_SVE_FROM_TOP_2X
|
||||||
|
// Interleaved store of the two packed RGB565 vectors.
"st2h {z18.h, z19.h}, p1, [%[dst]] \n"
|
||||||
|
"incb %[dst], all, mul #2 \n"
|
||||||
|
"b.ge 1b \n"
|
||||||
|
|
||||||
|
"2: \n"
|
||||||
|
// Undo the last subtraction to recover the leftover count; nothing more to
// do when it is zero.
"adds %w[width], %w[width], %w[vl] \n"
|
||||||
|
"b.eq 99f \n"
|
||||||
|
|
||||||
|
// Calculate a predicate for the final iteration to deal with the tail.
|
||||||
|
// vl is reloaded with the vector length in halfwords for the tail.
// NOTE(review): p1 is generated at byte granularity below but also governs
// the halfword st2h; confirm that odd widths do not store one extra pixel.
"cnth %[vl] \n"
|
||||||
|
"whilelt p1.b, wzr, %w[width] \n" READYUV422_SVE_2X
|
||||||
|
I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
|
||||||
|
"st2h {z18.h, z19.h}, p1, [%[dst]] \n"
|
||||||
|
|
||||||
|
"99: \n"
|
||||||
|
: [src_y] "+r"(src_y), // %[src_y]
|
||||||
|
[src_u] "+r"(src_u), // %[src_u]
|
||||||
|
[src_v] "+r"(src_v), // %[src_v]
|
||||||
|
[dst] "+r"(dst_rgb565), // %[dst]
|
||||||
|
[width] "+r"(width), // %[width]
|
||||||
|
[vl] "=&r"(vl) // %[vl]
|
||||||
|
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
|
||||||
|
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
|
||||||
|
: "cc", "memory", YUVTORGB_SVE_REGS);
|
||||||
|
}
|
||||||
|
|
||||||
void I422ToRGBARow_SVE2(const uint8_t* src_y,
|
void I422ToRGBARow_SVE2(const uint8_t* src_y,
|
||||||
const uint8_t* src_u,
|
const uint8_t* src_u,
|
||||||
const uint8_t* src_v,
|
const uint8_t* src_v,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user