[AArch64] Improve ARGBTOARGB4444 using SRI instructions

The existing sequence to convert from 8-bit ARGB to 4-bit ARGB4444 makes
use of a lot of shifts and bit-clears before ORR'ing the pairs together.
This is unnecessary since we can do the same with the SRI instruction,
so use that instead.

Reduction in runtime for selected kernels:

                Kernel | Cortex-A55 | Cortex-A76
ARGBToARGB4444Row_NEON |     -15.3% |     -16.6%
I422ToARGB4444Row_NEON |      -2.7% |     -11.9%

Bug: libyuv:976
Change-Id: I86cd86c7adf1105558787a679272179821f31a9d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5383443
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-03-13 05:20:06 +00:00 committed by libyuv LUCI CQ
parent b265c311b7
commit 95b0a3326c

View File

@@ -365,15 +365,11 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
: "cc", "memory", YUVTORGB_REGS, "v19");
}
#define ARGBTOARGB4444 \
/* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \
"ushr v16.8b, v16.8b, #4 \n" /* B */ \
"bic v17.8b, v17.8b, v23.8b \n" /* G */ \
"ushr v18.8b, v18.8b, #4 \n" /* R */ \
"bic v19.8b, v19.8b, v23.8b \n" /* A */ \
"orr v0.8b, v16.8b, v17.8b \n" /* BG */ \
"orr v1.8b, v18.8b, v19.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
#define ARGBTOARGB4444 \
/* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A */ \
"sri v17.8b, v16.8b, #4 \n" /* BG */ \
"sri v19.8b, v18.8b, #4 \n" /* RA */ \
"zip1 v0.16b, v17.16b, v19.16b \n" /* BGRA */
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@@ -383,8 +379,6 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"movi v23.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
@@ -400,7 +394,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
: "cc", "memory", YUVTORGB_REGS, "v19", "v23");
: "cc", "memory", YUVTORGB_REGS, "v19");
}
void I400ToARGBRow_NEON(const uint8_t* src_y,
@@ -2042,8 +2036,6 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile(
"movi v23.16b, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
// pixels
@@ -2055,7 +2047,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
"+r"(dst_argb4444), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19");
}
#if LIBYUV_USE_ST2