mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
[AArch64] Improve ARGBTOARGB4444 using SRI instructions
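The existing sequence to convert from 8-bit-per-channel ARGB to
4-bit-per-channel ARGB4444 uses a lot of shifts and bit-clears before
ORR'ing the pairs of channels together. This is unnecessary: a single SRI
(shift right and insert) instruction shifts one channel right by four bits
and inserts it into the low nibble of the other channel while preserving
that channel's high nibble, so use that instead.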
Change in runtime for selected kernels (negative is faster):

| Kernel                 | Cortex-A55 | Cortex-A76 |
|------------------------|------------|------------|
| ARGBToARGB4444Row_NEON |     -15.3% |     -16.6% |
| I422ToARGB4444Row_NEON |      -2.7% |     -11.9% |
Bug: libyuv:976
Change-Id: I86cd86c7adf1105558787a679272179821f31a9d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5383443
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
b265c311b7
commit
95b0a3326c
@ -365,15 +365,11 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
|
||||
: "cc", "memory", YUVTORGB_REGS, "v19");
|
||||
}
|
||||
|
||||
#define ARGBTOARGB4444 \
|
||||
/* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \
|
||||
"ushr v16.8b, v16.8b, #4 \n" /* B */ \
|
||||
"bic v17.8b, v17.8b, v23.8b \n" /* G */ \
|
||||
"ushr v18.8b, v18.8b, #4 \n" /* R */ \
|
||||
"bic v19.8b, v19.8b, v23.8b \n" /* A */ \
|
||||
"orr v0.8b, v16.8b, v17.8b \n" /* BG */ \
|
||||
"orr v1.8b, v18.8b, v19.8b \n" /* RA */ \
|
||||
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
|
||||
#define ARGBTOARGB4444 \
|
||||
/* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A */ \
|
||||
"sri v17.8b, v16.8b, #4 \n" /* BG */ \
|
||||
"sri v19.8b, v18.8b, #4 \n" /* RA */ \
|
||||
"zip1 v0.16b, v17.16b, v19.16b \n" /* BGRA */
|
||||
|
||||
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
@ -383,8 +379,6 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v23.16b, #0x0f \n" // bits to clear with
|
||||
// vbic.
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
@ -400,7 +394,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
|
||||
[width] "+r"(width) // %[width]
|
||||
: [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
|
||||
: "cc", "memory", YUVTORGB_REGS, "v19", "v23");
|
||||
: "cc", "memory", YUVTORGB_REGS, "v19");
|
||||
}
|
||||
|
||||
void I400ToARGBRow_NEON(const uint8_t* src_y,
|
||||
@ -2042,8 +2036,6 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb4444,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"movi v23.16b, #0x0f \n" // bits to clear with
|
||||
// vbic.
|
||||
"1: \n"
|
||||
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
|
||||
// pixels
|
||||
@ -2055,7 +2047,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
|
||||
"+r"(dst_argb4444), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
|
||||
: "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19");
|
||||
}
|
||||
|
||||
#if LIBYUV_USE_ST2
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user