[AArch64] Avoid lane-indexed loads for UV when loading I444/I422

Most micro-architectures seem to prefer an additional ZIP1 instruction
in READYUV422 over a lane-indexed LD1 load instruction.

We introduce a new macro to handle the YUV to RGB conversion where the U
and V components are in separate vectors. This avoids causing a slowdown
for the UV-interleaved input format kernels (NV12 and NV21) where we do
not want to separate them.

Change in runtime for selected kernels on Cortex cores (negative values
are faster; no performance difference observed on Cortex-A55):

                           A510     A76    A720      X1      X2
 I422AlphaToARGBRow_NEON  -4.3%   -7.3%  -10.1%   -4.0%   -4.4%
  I422ToARGB1555Row_NEON  -4.5%   +0.4%   -7.9%   -4.8%   -3.9%
  I422ToARGB4444Row_NEON  -7.7%   -2.6%   -4.1%   -1.9%   -1.3%
      I422ToARGBRow_NEON  -3.7%   -2.9%  -10.2%   -3.8%   -4.4%
     I422ToRGB24Row_NEON  -5.9%   +5.4%   -3.2%   -4.3%   -4.3%
    I422ToRGB565Row_NEON  -4.8%   -2.8%   -8.5%   -3.8%   -4.6%
      I422ToRGBARow_NEON  -3.7%   +4.6%  -10.5%   -3.0%   -4.5%
 I444AlphaToARGBRow_NEON  -3.5%   +2.7%   -3.7%   -5.0%   -8.2%
      I444ToARGBRow_NEON  -1.8%  -15.1%   -3.5%   -6.5%   -8.1%
     I444ToRGB24Row_NEON  -2.0%   -6.8%   +0.1%   -4.7%   +1.2%

There are a few cases which are slower, mostly on Cortex-A76, but there
are significant speedups elsewhere.

Bug: libyuv:976
Change-Id: Ib3b4ef81f7bfc1d7ff9c4c24aef9ad86741410ff
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5465580
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-03-14 07:36:54 +00:00 committed by Frank Barchard
parent bfedc8bc11
commit 1ca7c4e1cc

View File

@ -29,10 +29,11 @@ extern "C" {
#define READYUV422 \
"ldr d0, [%[src_y]], #8 \n" \
"ldr s1, [%[src_u]], #4 \n" \
"ld1 {v1.s}[1], [%[src_v]], #4 \n" \
"ldr s2, [%[src_v]], #4 \n" \
"zip1 v0.16b, v0.16b, v0.16b \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"zip1 v1.16b, v1.16b, v1.16b \n" \
"zip1 v1.8b, v1.8b, v1.8b \n" \
"zip1 v2.8b, v2.8b, v2.8b \n" \
"prfm pldl1keep, [%[src_u], 128] \n" \
"prfm pldl1keep, [%[src_v], 128] \n"
@ -40,8 +41,8 @@ extern "C" {
#define READYUV444 \
"ldr d0, [%[src_y]], #8 \n" \
"ldr d1, [%[src_u]], #8 \n" \
"ldr d2, [%[src_v]], #8 \n" \
"prfm pldl1keep, [%[src_y], 448] \n" \
"ld1 {v1.d}[1], [%[src_v]], #8 \n" \
"prfm pldl1keep, [%[src_u], 448] \n" \
"zip1 v0.16b, v0.16b, v0.16b \n" \
"prfm pldl1keep, [%[src_v], 448] \n"
@ -94,8 +95,9 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
// v17.8h: G
// v18.8h: R
// Convert from YUV to 2.14 fixed point RGB
#define YUVTORGB \
// Convert from YUV (NV12, NV21, YUY2 or UYVY) to 2.14 fixed point RGB.
// Similar to I4XXTORGB but U/V components are in the low/high halves of v1.
#define NVTORGB \
"umull2 v3.4s, v0.8h, v24.8h \n" \
"umull v6.8h, v1.8b, v30.8b \n" \
"umull v0.4s, v0.4h, v24.4h \n" \
@ -110,6 +112,23 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
"uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
"uqsub v18.8h, v18.8h, v27.8h \n" /* R */
// Convert from YUV (I444, I422 or I420) to 2.14 fixed point RGB.
// Similar to NVTORGB but U/V components are in v1/v2.
//
// Inputs:
//   v0.8h: Y, as left by READYUV444/READYUV422 (each Y byte duplicated into
//          both halves of a 16-bit lane by their zip1)
//   v1.8b: U, one byte per pixel (READYUV422 duplicates each U byte first)
//   v2.8b: V, one byte per pixel (READYUV422 duplicates each V byte first)
//   v24-v31: conversion constants.
//          NOTE(review): presumably loaded by YUVTORGB_SETUP - confirm there.
// Outputs:
//   v16.8h: B, v17.8h: G, v18.8h: R
// Overwrites v0 and uses v3-v6 as temporaries.
#define I4XXTORGB \
"umull2 v3.4s, v0.8h, v24.8h \n" /* Y lanes 4-7 * v24, 32-bit products */ \
"umull v6.8h, v1.8b, v30.8b \n" /* U * v30, first term of DG */ \
"umull v0.4s, v0.4h, v24.4h \n" /* Y lanes 0-3 * v24, 32-bit products */ \
"umlal v6.8h, v2.8b, v31.8b \n" /* DG */ \
"uzp2 v0.8h, v0.8h, v3.8h \n" /* Y: high 16 bits of each product */ \
"umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
"umull v5.8h, v2.8b, v29.8b \n" /* DR */ \
"add v17.8h, v0.8h, v26.8h \n" /* G */ \
"add v16.8h, v0.8h, v4.8h \n" /* B */ \
"add v18.8h, v0.8h, v5.8h \n" /* R */ \
"uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
"uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
"uqsub v18.8h, v18.8h, v27.8h \n" /* R */
// Convert from YUV I400 to 2.14 fixed point RGB
#define I400TORGB \
"umull2 v3.4s, v0.8h, v24.8h \n" \
@ -128,9 +147,9 @@ static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9, 9, 13, 13,
"uqshrn v16.8b, v16.8h, #6 \n" \
"uqshrn v18.8b, v18.8h, #6 \n"
#define YUVTORGB_REGS \
"v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
"v26", "v27", "v28", "v29", "v30", "v31"
#define YUVTORGB_REGS \
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", \
"v25", "v26", "v27", "v28", "v29", "v30", "v31"
void I444ToARGBRow_NEON(const uint8_t* src_y,
const uint8_t* src_u,
@ -141,7 +160,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n" /* A */
"1: \n" READYUV444 YUVTORGB
"1: \n" READYUV444 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
@ -164,7 +183,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV444 YUVTORGB
"1: \n" READYUV444 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
@ -188,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n" /* A */
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
@ -214,7 +233,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
"ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
"prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -240,7 +259,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
"ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
"prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -264,7 +283,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v15.8b, #255 \n" /* A */
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
@ -287,7 +306,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
@ -317,8 +336,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565
"st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -349,7 +368,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
@ -379,7 +398,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"movi v19.8b, #255 \n" ARGBTOARGB4444
@ -471,7 +490,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -494,7 +513,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -516,7 +535,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
"b.gt 1b \n"
@ -538,7 +557,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
"b.gt 1b \n"
@ -560,7 +579,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n" ARGBTORGB565
"st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
// pixels
@ -584,8 +603,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV21InterleavedTable]] \n"
"1: \n" READYUY2 YUVTORGB
RGBTORGB8
"1: \n" READYUY2 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -606,8 +624,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12InterleavedTable]] \n"
"1: \n" READUYVY YUVTORGB
RGBTORGB8
"1: \n" READUYVY NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"