aarch32 J420ToI420

benchmark on medium core
adbrun -- taskset 10 blaze-bin/third_party/libyuv/libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Now Neon
J420ToI420_Opt (159 ms)
Was C
J420ToI420_Opt (215 ms)

AArch64
J420ToI420_Opt (93 ms)

The C version (Convert8To8Row_C) compiles to this aarch32 inner loop:
vld1.8	{d20, d21}, [r6]!
vorr	q12, q8, q8
subs	r4, #16
vmovl.u8	q11, d21
vmovl.u8	q10, d20
vmul.i16	q11, q9, q11
vmul.i16	q10, q9, q10
vsra.u16	q12, q11, #8
vorr	q11, q8, q8
vsra.u16	q11, q10, #8
vmovn.i16	d21, q12
vmovn.i16	d20, q11
vst1.8	{d20, d21}, [r5]!
bne	0x3d9078 <Convert8To8Row_C+0x36> @ imm = #-54

Explanation of the above C code:
vorr copies the bias of 16 (held in q8) into the accumulator register
vsra shifts the product right by 8 and accumulates it into that register

Compared to the aarch64 Neon version:
instead of mull, the C code uses movl + mul
instead of uzp2, the C code uses sra #8 + movn, which takes 2 movn vs 1 uzp2
instead of add, the C code uses vorr + sra

Change-Id: I9648f06e52ccbafaecf07bd89f8ffff27565d025
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6189497
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
Frank Barchard 2025-01-22 04:11:54 -08:00
parent 26277baf96
commit 67f3f17d9a
3 changed files with 224 additions and 209 deletions

View File

@ -405,6 +405,7 @@ extern "C" {
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
#define HAS_CONVERT16TO8ROW_NEON
#define HAS_CONVERT8TO8ROW_NEON
#define HAS_COPYROW_NEON
#define HAS_DETILEROW_16_NEON
#define HAS_DETILEROW_NEON
@ -507,7 +508,6 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_CONVERT8TO8ROW_NEON
#define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON
#define HAS_I210ALPHATOARGBROW_NEON

View File

@ -143,9 +143,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV444 YUVTORGB
RGBTORGB8
"subs %[width], %[width], #8 \n"
"1: \n" READYUV444
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -166,9 +165,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV444 YUVTORGB
RGBTORGB8
"subs %[width], %[width], #8 \n"
"1: \n" READYUV444
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
"vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -190,9 +188,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
"subs %[width], %[width], #8 \n"
"1: \n" READYUV422
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -214,10 +211,9 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV444 YUVTORGB
RGBTORGB8
"1: \n" READYUV444
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
"vld1.8 {d6}, [%[src_a]]! \n"
"subs %[width], %[width], #8 \n"
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -240,10 +236,9 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
"1: \n" READYUV422
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
"vld1.8 {d6}, [%[src_a]]! \n"
"subs %[width], %[width], #8 \n"
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -266,9 +261,9 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA
"bgt 1b \n"
"1: \n" READYUV422
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
STORERGBA "bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
@ -288,9 +283,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
"subs %[width], %[width], #8 \n"
"1: \n" READYUV422
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
"vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -319,8 +313,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565
"1: \n" READYUV422
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
ARGBTORGB565
"vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -350,9 +345,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
"subs %[width], %[width], #8 \n"
"1: \n" READYUV422
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
"vmov.u8 d6, #0xff \n" ARGBTOARGB1555
"vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555.
"bgt 1b \n"
@ -425,9 +419,9 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
"vmov.u8 d23, #255 \n"
"1: \n"
"vld1.8 {d20}, [%0]! \n"
"subs %2, %2, #8 \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@ -731,6 +725,7 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
"vld1.8 {q11}, [%0]! \n"
"vld1.8 {q13}, [%0]! \n"
"vld1.8 {q15}, [%0]! \n"
"subs %2, %2, #80 \n"
"vshl.u8 q8, q14, #6 \n" // Shift lower bit data
// appropriately.
"vshl.u8 q10, q14, #4 \n"
@ -753,7 +748,6 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
"vsri.u16 q15, q15, #10 \n"
"vstmia %1!, {q8-q15} \n" // Store pixel block (64
// pixels).
"subs %2, %2, #80 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -954,6 +948,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
"vld1.16 {d4}, [%2]! \n" // B
"vld1.16 {d2}, [%1]! \n" // G
"vld1.16 {d0}, [%0]! \n" // R
"subs %4, %4, #4 \n"
"vmovl.u16 q2, d4 \n" // B
"vmovl.u16 q1, d2 \n" // G
"vmovl.u16 q0, d0 \n" // R
@ -966,7 +961,6 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
"vsli.u32 q2, q1, #10 \n" // 00GB
"vsli.u32 q2, q0, #20 \n" // 0RGB
"vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
"subs %4, %4, #4 \n"
"vst1.8 {q2}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
@ -990,6 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
"vld1.16 {d4}, [%2]! \n" // B
"vld1.16 {d2}, [%1]! \n" // G
"vld1.16 {d0}, [%0]! \n" // R
"subs %4, %4, #4 \n"
"vmovl.u16 q2, d4 \n" // 000B
"vmovl.u16 q1, d2 \n" // G
"vmovl.u16 q0, d0 \n" // R
@ -999,7 +994,6 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
"vsli.u32 q2, q1, #10 \n" // 00GB
"vsli.u32 q2, q0, #20 \n" // 0RGB
"vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
"subs %4, %4, #4 \n"
"vst1.8 {q2}, [%3]! \n"
"bgt 1b \n"
"3: \n"
@ -1030,6 +1024,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"vld1.16 {q3}, [%3]! \n" // A
"subs %5, %5, #8 \n"
"vmin.u16 q2, q2, q14 \n"
"vmin.u16 q1, q1, q14 \n"
"vmin.u16 q0, q0, q14 \n"
@ -1038,7 +1033,6 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
"vshl.u16 q3, q3, q15 \n"
"subs %5, %5, #8 \n"
"vst4.16 {d0, d2, d4, d6}, [%4]! \n"
"vst4.16 {d1, d3, d5, d7}, [%4]! \n"
"bgt 1b \n"
@ -1070,13 +1064,13 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
"vld1.16 {q2}, [%0]! \n" // R
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"subs %4, %4, #8 \n"
"vmin.u16 q2, q2, q14 \n"
"vmin.u16 q1, q1, q14 \n"
"vmin.u16 q0, q0, q14 \n"
"vshl.u16 q2, q2, q15 \n"
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
"subs %4, %4, #8 \n"
"vst4.16 {d0, d2, d4, d6}, [%3]! \n"
"vst4.16 {d1, d3, d5, d7}, [%3]! \n"
"bgt 1b \n"
@ -1106,6 +1100,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"vld1.16 {q3}, [%3]! \n" // A
"subs %5, %5, #8 \n"
"vshl.u16 q2, q2, q15 \n"
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
@ -1114,7 +1109,6 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
"vqmovn.u16 d1, q1 \n"
"vqmovn.u16 d2, q2 \n"
"vqmovn.u16 d3, q3 \n"
"subs %5, %5, #8 \n"
"vst4.8 {d0, d1, d2, d3}, [%4]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
@ -1142,13 +1136,13 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
"vld1.16 {q2}, [%0]! \n" // R
"vld1.16 {q1}, [%1]! \n" // G
"vld1.16 {q0}, [%2]! \n" // B
"subs %4, %4, #8 \n"
"vshl.u16 q2, q2, q15 \n"
"vshl.u16 q1, q1, q15 \n"
"vshl.u16 q0, q0, q15 \n"
"vqmovn.u16 d5, q2 \n"
"vqmovn.u16 d4, q1 \n"
"vqmovn.u16 d3, q0 \n"
"subs %4, %4, #8 \n"
"vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_r), // %0
@ -1600,8 +1594,8 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
"add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
"vst1.8 {d1}, [%2]! \n" // store 8 U.
@ -1627,8 +1621,8 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
"add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
"vst1.8 {d0}, [%2]! \n" // store 8 U.
@ -1924,6 +1918,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
@ -1937,7 +1932,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -1970,6 +1964,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
@ -1983,7 +1978,6 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2015,6 +2009,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
@ -2028,7 +2023,6 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2061,6 +2055,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
@ -2074,7 +2069,6 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2107,6 +2101,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
@ -2120,7 +2115,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2152,6 +2146,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
@ -2165,7 +2160,6 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
"vrshr.u16 q2, q2, #1 \n"
"vrshr.u16 q3, q3, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q3, q2, q1)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2197,6 +2191,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
@ -2210,7 +2205,6 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2242,6 +2236,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
@ -2255,7 +2250,6 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2287,6 +2281,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
@ -2300,7 +2295,6 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2332,6 +2326,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
@ -2345,7 +2340,6 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2378,6 +2372,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
@ -2403,7 +2398,6 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"vrshr.u16 q5, q5, #1 \n"
"vrshr.u16 q6, q6, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
"vmul.s16 q8, q4, q10 \n" // B
"vmls.s16 q8, q5, q11 \n" // G
"vmls.s16 q8, q6, q12 \n" // R
@ -2444,6 +2438,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
@ -2469,7 +2464,6 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"vrshr.u16 q5, q5, #1 \n"
"vrshr.u16 q6, q6, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
"vmul.s16 q8, q4, q10 \n" // B
"vmls.s16 q8, q5, q11 \n" // G
"vmls.s16 q8, q6, q12 \n" // R
@ -2510,6 +2504,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
@ -2535,7 +2530,6 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"vrshr.u16 q1, q5, #1 \n"
"vrshr.u16 q2, q6, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -2633,9 +2627,9 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q2}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmov.u8 q1, q0 \n"
"vmov.u8 q3, q2 \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
"vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
"bgt 1b \n"
@ -2658,13 +2652,13 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q2}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vtbl.8 d2, {d0, d1}, d8 \n"
"vtbl.8 d3, {d0, d1}, d9 \n"
"vtbl.8 d6, {d4, d5}, d8 \n"
"vtbl.8 d7, {d4, d5}, d9 \n"
"vmov.u8 q0, q1 \n"
"vmov.u8 q2, q3 \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
"vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
"bgt 1b \n"
@ -2684,11 +2678,11 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
"vld1.16 {q1}, [%0]! \n"
"vld1.16 {q2}, [%0]! \n"
"vld1.16 {q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vshrn.u16 d0, q0, #8 \n"
"vshrn.u16 d1, q1, #8 \n"
"vshrn.u16 d4, q2, #8 \n"
"vshrn.u16 d5, q3, #8 \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst1.8 {q0}, [%1]! \n" // store 4 pixels
"vst1.8 {q2}, [%1]! \n" // store 4 pixels
"bgt 1b \n"
@ -2712,11 +2706,11 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
"vld1.16 {q1}, [%0]! \n"
"vld1.16 {q2}, [%0]! \n"
"vld1.16 {q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vtbl.8 d0, {d0, d1}, d8 \n"
"vtbl.8 d1, {d2, d3}, d8 \n"
"vtbl.8 d4, {d4, d5}, d8 \n"
"vtbl.8 d5, {d6, d7}, d8 \n"
"subs %2, %2, #8 \n" // 8 processed per loop.
"vst1.8 {q0}, [%1]! \n" // store 4 pixels
"vst1.8 {q2}, [%1]! \n" // store 4 pixels
"bgt 1b \n"
@ -3472,6 +3466,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"1: \n"
"vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q0, d0, d1 \n"
"vld1.8 {d2}, [%1],%5 \n" // center * 2
"vld1.8 {d3}, [%1],%6 \n"
@ -3480,7 +3475,6 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"vadd.s16 q0, q0, q1 \n"
"vld1.8 {d2}, [%2],%5 \n" // bottom
"vld1.8 {d3}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
@ -3510,6 +3504,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"1: \n"
"vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n"
"subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q0, d0, d1 \n"
"vld1.8 {d2}, [%0],%4 \n" // center * 2
"vld1.8 {d3}, [%1],%4 \n"
@ -3518,7 +3513,6 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"vadd.s16 q0, q0, q1 \n"
"vld1.8 {d2}, [%0],%5 \n" // right
"vld1.8 {d3}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
@ -3613,6 +3607,7 @@ void GaussCol_NEON(const uint16_t* src0,
"1: \n"
"vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
"vld1.16 {q2}, [%4]! \n"
"subs %6, %6, #8 \n" // 8 processed per loop
"vaddl.u16 q0, d2, d4 \n" // * 1
"vaddl.u16 q1, d3, d5 \n" // * 1
"vld1.16 {q2}, [%1]! \n"
@ -3624,7 +3619,6 @@ void GaussCol_NEON(const uint16_t* src0,
"vld1.16 {q2}, [%3]! \n"
"vmlal.u16 q0, d4, d6 \n" // * 4
"vmlal.u16 q1, d5, d6 \n" // * 4
"subs %6, %6, #8 \n" // 8 processed per loop
"vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
"bgt 1b \n"
: "+r"(src0), // %0
@ -3650,6 +3644,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"1: \n"
"vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
"vld1.32 {q2}, [%0] \n"
"subs %5, %5, #8 \n" // 8 processed per loop
"vadd.u32 q0, q0, q1 \n" // * 1
"vadd.u32 q1, q1, q2 \n" // * 1
"vld1.32 {q2, q3}, [%2]! \n"
@ -3661,7 +3656,6 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"vadd.u32 q3, q3, q9 \n"
"vmla.u32 q0, q2, q10 \n" // * 4
"vmla.u32 q1, q3, q10 \n" // * 4
"subs %5, %5, #8 \n" // 8 processed per loop
"vqshrn.u32 d0, q0, #8 \n" // round and pack
"vqshrn.u32 d1, q1, #8 \n"
"vst1.u16 {q0}, [%4]! \n" // store 8 samples
@ -3685,11 +3679,11 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
"subs %3, %3, #16 \n" // 16 pixels per loop
"vmov d1, d0 \n"
"vzip.u8 d0, d1 \n" // VV
"vmov d3, d2 \n"
"vzip.u8 d2, d3 \n" // UU
"subs %3, %3, #16 \n" // 16 pixels per loop
"vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
"vst3.8 {d1, d3, d5}, [%2]! \n"
"bgt 1b \n"
@ -3711,6 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
"subs %3, %3, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
@ -3721,7 +3716,6 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d0, q1, #2 \n"
"subs %3, %3, #16 \n" // 16 processed per loop.
"vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
"bgt 1b \n"
: "+r"(src_ayuv), // %0
@ -3742,6 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
// pixels.
"subs %3, %3, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
@ -3752,7 +3747,6 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d1, q1, #2 \n"
"subs %3, %3, #16 \n" // 16 processed per loop.
"vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
"bgt 1b \n"
: "+r"(src_ayuv), // %0
@ -3786,8 +3780,8 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
"vld2.8 {d1, d3}, [%0]! \n"
"vmov.u8 q2, q0 \n" // move U after V
"subs %2, %2, #16 \n" // 16 pixels per loop
"vmov.u8 q2, q0 \n" // move U after V
"vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
"bgt 1b \n"
: "+r"(src_uv), // %0
@ -3811,13 +3805,13 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
"vld1.8 {q1}, [%2]! \n" // load 16 V values
"vld1.8 {q2}, [%1]! \n"
"vld1.8 {q3}, [%3]! \n"
"subs %5, %5, #16 \n" // 16 src pixels per loop
"vpaddl.u8 q0, q0 \n" // half size
"vpaddl.u8 q1, q1 \n"
"vpadal.u8 q0, q2 \n"
"vpadal.u8 q1, q3 \n"
"vqrshrn.u16 d0, q0, #2 \n"
"vqrshrn.u16 d1, q1, #2 \n"
"subs %5, %5, #16 \n" // 16 src pixels per loop
"vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
"bgt 1b \n"
: "+r"(src_u), // %0
@ -3840,9 +3834,9 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
"vdup.16 q2, %4 \n"
"1: \n"
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vst1.16 {q0}, [%1]! \n" // store 8 U pixels
"vst1.16 {q1}, [%2]! \n" // store 8 V pixels
"bgt 1b \n"
@ -3865,9 +3859,9 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
"1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U
"vld1.16 {q1}, [%1]! \n" // load 8 V
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
"bgt 1b \n"
: "+r"(src_u), // %0
@ -3887,11 +3881,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
"1: \n"
"vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop
"vmul.u16 q0, q0, q2 \n"
"vmul.u16 q1, q1, q2 \n"
"vst1.16 {q0}, [%1]! \n"
"vst1.16 {q1}, [%1]! \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -3908,6 +3902,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"vdup.16 d8, %3 \n"
"1: \n"
"vld1.16 {q2, q3}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop
"vmull.u16 q0, d4, d8 \n"
"vmull.u16 q1, d5, d8 \n"
"vmull.u16 q2, d6, d8 \n"
@ -3917,7 +3912,6 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"vshrn.u32 d2, q2, #16 \n"
"vshrn.u32 d3, q3, #16 \n"
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
"subs %2, %2, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -3941,11 +3935,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
"1: \n"
"vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop
"vshl.u16 q0, q0, q2 \n" // shr = q2 is negative
"vshl.u16 q1, q1, q2 \n"
"vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d1, q1 \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@ -3955,6 +3949,41 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
: "cc", "memory", "q0", "q1", "q2");
}
// Use scale to convert J420 to I420
// scale parameter is 8.8 fixed point but limited to 0 to 255
// Function is based on DivideRow, but adds a bias
// Does not clamp
//
// Per byte this computes:
//   dst_y[i] = (uint8_t)(((src_y[i] * scale) >> 8) + bias)
// - scale and bias are broadcast with vdup.8, so only their low 8 bits are
//   used; the effective multiplier is therefore scale / 256 (below 1.0).
// - The shift truncates (vshrn, not the rounding vrshrn).
// - The bias add is a plain 8-bit vadd, so it wraps modulo 256 rather than
//   saturating ("does not clamp").
//
// src_y: source bytes, advanced by 32 per iteration.
// dst_y: destination bytes, advanced by 32 per iteration.
// width: number of pixels; decremented by 32 per iteration.
// NOTE(review): the loop body runs before width is tested and always handles
// 32 bytes, so width is presumably expected to be a positive multiple of 32
// with remainders handled by the caller - confirm against the call sites.
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
asm volatile(
"vdup.8 d8, %3 \n"  // d8 = low byte of scale in all 8 lanes
"vdup.8 q5, %4 \n"  // q5 = low byte of bias in all 16 lanes
"1: \n"
"vld1.8 {q2, q3}, [%0]! \n"  // load 32 source bytes into d4-d7
// The flags set here are consumed by the bgt at the bottom; the NEON
// instructions in between do not modify the condition flags.
"subs %2, %2, #32 \n" // 32 src pixels per loop
// Widening multiply: 8 bytes x scale -> 8 x 16-bit products each.
// q2/q3 are overwritten only after (or in the same instruction as) their
// source halves d4-d7 have been read.
"vmull.u8 q0, d4, d8 \n"
"vmull.u8 q1, d5, d8 \n"
"vmull.u8 q2, d6, d8 \n"
"vmull.u8 q3, d7, d8 \n"
// Narrow back to bytes keeping the high 8 bits of each product (>> 8).
// Results land in d0-d3 (= q0, q1) in source order.
"vshrn.u16 d0, q0, #8 \n"
"vshrn.u16 d1, q1, #8 \n"
"vshrn.u16 d2, q2, #8 \n"
"vshrn.u16 d3, q3, #8 \n"
// Add the bias to all 32 bytes (wrapping, non-saturating).
"vadd.u8 q0, q0, q5 \n"
"vadd.u8 q1, q1, q5 \n"
"vst1.8 {q0, q1}, [%1]! \n" // store 32 pixels
"bgt 1b \n"  // loop while the decremented width is > 0
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale), // %3
"r"(bias) // %4
// q0-q3 are working registers; d8 and q5 hold the broadcast scale and bias.
: "cc", "memory", "q0", "q1", "q2", "q3", "d8", "q5");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
#ifdef __cplusplus

View File

@ -242,9 +242,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n" /* A */
"1: \n" READYUV444 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV444
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -265,9 +264,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV444 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV444
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -293,8 +291,8 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"1: \n" READYUV210
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -321,8 +319,8 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"1: \n" READYUV410
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -348,8 +346,8 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"1: \n" READYUV212 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"1: \n" READYUV212
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -371,9 +369,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV210 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV210
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -395,9 +392,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV410 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV410
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -421,9 +417,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV212 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV212
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -445,9 +440,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n" /* A */
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV422
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -475,8 +469,8 @@ void P210ToARGBRow_NEON(const uint16_t* src_y,
"movi v19.8b, #255 \n"
"ldr q2, [%[kIndices]] \n"
"1: \n" //
READYUVP210 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
READYUVP210
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -504,8 +498,8 @@ void P410ToARGBRow_NEON(const uint16_t* src_y,
"movi v19.8b, #255 \n"
"ldr q2, [%[kIndices]] \n"
"1: \n" //
READYUVP410 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
READYUVP410
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -526,23 +520,22 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"1: \n" READYUVP210
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void P410ToAR30Row_NEON(const uint16_t* src_y,
@ -553,23 +546,22 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"1: \n" READYUVP410
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I422ToAR30Row_NEON(const uint8_t* src_y,
@ -585,8 +577,8 @@ void I422ToAR30Row_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"1: \n" READYUV422 I4XXTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"1: \n" READYUV422
"subs %w[width], %w[width], #8 \n" I4XXTORGB STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -610,8 +602,8 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
"ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -636,8 +628,8 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
YUVTORGB_SETUP
"1: \n"
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -662,8 +654,8 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
YUVTORGB_SETUP
"1: \n"
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -688,8 +680,8 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
"ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -712,9 +704,8 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v15.8b, #255 \n" /* A */
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV422
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
"st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -735,9 +726,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV422
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -777,9 +767,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
"1: \n" READYUV422
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8_TOP
ARGBTORGB565_FROM_TOP
"st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -818,10 +808,9 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8h, #0x80, lsl #8 \n"
"1: \n" //
READYUV422 I4XXTORGB RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" //
ARGBTOARGB1555_FROM_TOP
"1: \n" //
READYUV422 "subs %w[width], %w[width], #8 \n" //
I4XXTORGB RGBTORGB8_TOP ARGBTOARGB1555_FROM_TOP
"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
// RGB1555.
"b.gt 1b \n"
@ -849,9 +838,8 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 I4XXTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUV422
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
"movi v19.8b, #255 \n" ARGBTOARGB4444
"st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
// pixels
@ -880,8 +868,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
"umull v4.8h, v1.8b, v28.8b \n" /* DB */
"umull2 v5.8h, v1.16b, v29.16b \n" /* DR */
"1: \n" READYUV400 I400TORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"subs %w[width], %w[width], #8 \n" RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -898,10 +885,10 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
"movi v23.8b, #255 \n"
"1: \n"
"ld1 {v20.8b}, [%0], #8 \n"
"subs %w2, %w2, #8 \n"
"prfm pldl1keep, [%0, 448] \n"
"mov v21.8b, v20.8b \n"
"mov v22.8b, v20.8b \n"
"subs %w2, %w2, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -941,8 +928,8 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READNV12
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -964,8 +951,8 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READNV12
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -986,8 +973,8 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READNV12
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -1008,8 +995,8 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READNV12
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
@ -1030,9 +1017,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"ldr q2, [%[kNV12Table]] \n"
"1: \n" READNV12 NVTORGB
RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
"1: \n" READNV12
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8_TOP
ARGBTORGB565_FROM_TOP
"st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
// pixels
// RGB565.
@ -1055,8 +1042,8 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV21InterleavedTable]] \n"
"1: \n" READYUY2 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READYUY2
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
@ -1076,8 +1063,8 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"ldr q2, [%[kNV12InterleavedTable]] \n"
"1: \n" READUYVY NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"1: \n" READUYVY
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
: [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
@ -1188,10 +1175,10 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
asm volatile(
"1: \n"
"ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
"subs %w3, %w3, #16 \n" // store 8 YUY2
"prfm pldl1keep, [%0, 1792] \n"
"ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
"prfm pldl1keep, [%1, 1792] \n"
"subs %w3, %w3, #16 \n" // store 8 YUY2
"st2 {v0.16b,v1.16b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -1240,10 +1227,10 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
"1: \n"
"ld1 {v7.16b}, [%0], #16 \n"
"ld1 {v0.16b-v3.16b}, [%0], #64 \n"
"subs %2, %2, #80 \n"
"shl v4.16b, v7.16b, #6 \n"
"shl v5.16b, v7.16b, #4 \n"
"shl v6.16b, v7.16b, #2 \n"
"subs %2, %2, #80 \n"
"zip1 v16.16b, v4.16b, v0.16b \n"
"zip1 v18.16b, v5.16b, v1.16b \n"
"zip1 v20.16b, v6.16b, v2.16b \n"
@ -1305,8 +1292,8 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
"dup v2.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ushl v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"ushl v1.8h, v1.8h, v2.8h \n"
@ -1356,8 +1343,8 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
"dup v4.8h, %w4 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
"ushl v0.8h, v0.8h, v4.8h \n"
"ushl v1.8h, v1.8h, v4.8h \n"
"prfm pldl1keep, [%0, 448] \n"
@ -1596,6 +1583,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
"ldr d2, [%2], #8 \n" // B
"ldr d1, [%1], #8 \n" // G
"ldr d0, [%0], #8 \n" // R
"subs %w4, %w4, #4 \n"
"ushll v2.4s, v2.4h, #0 \n" // B
"ushll v1.4s, v1.4h, #0 \n" // G
"ushll v0.4s, v0.4h, #0 \n" // R
@ -1608,7 +1596,6 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
"sli v2.4s, v1.4s, #10 \n" // 00GB
"sli v2.4s, v0.4s, #20 \n" // 0RGB
"orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
"subs %w4, %w4, #4 \n"
"str q2, [%3], #16 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
@ -1637,6 +1624,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
"ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr
"ldr q1, [%1], #16 \n" // xxxxxxGggggggggg
"ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb
"subs %w4, %w4, #8 \n"
"umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr
"umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg
"movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000
@ -1644,7 +1632,6 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
"mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000
"mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb
"usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg
"subs %w4, %w4, #8 \n"
"st2 {v3.8h, v4.8h}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
@ -1674,6 +1661,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"ldr q3, [%3], #16 \n" // A
"subs %w5, %w5, #8 \n"
"umin v2.8h, v2.8h, v30.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"umin v1.8h, v1.8h, v30.8h \n"
@ -1686,7 +1674,6 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
"ushl v1.8h, v1.8h, v31.8h \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"ushl v3.8h, v3.8h, v31.8h \n"
"subs %w5, %w5, #8 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
@ -1718,6 +1705,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
"ldr q2, [%0], #16 \n" // R
"ldr q1, [%1], #16 \n" // G
"ldr q0, [%2], #16 \n" // B
"subs %w4, %w4, #8 \n"
"umin v2.8h, v2.8h, v30.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"umin v1.8h, v1.8h, v30.8h \n"
@ -1727,7 +1715,6 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
"ushl v2.8h, v2.8h, v31.8h \n"
"ushl v1.8h, v1.8h, v31.8h \n"
"ushl v0.8h, v0.8h, v31.8h \n"
"subs %w4, %w4, #8 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
"b.gt 1b \n"
: "+r"(src_r), // %0
@ -1756,6 +1743,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
"ldr q1, [%1], #16 \n" // G
"ldr q2, [%2], #16 \n" // R
"ldr q3, [%3], #16 \n" // A
"subs %w5, %w5, #8 \n"
"uqshl v0.8h, v0.8h, v31.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uqshl v1.8h, v1.8h, v31.8h \n"
@ -1766,7 +1754,6 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
"prfm pldl1keep, [%3, 448] \n"
"trn2 v0.16b, v0.16b, v1.16b \n"
"trn2 v1.16b, v2.16b, v3.16b \n"
"subs %w5, %w5, #8 \n"
"st2 {v0.8h, v1.8h}, [%4], #32 \n"
"b.gt 1b \n"
: "+r"(src_b), // %0
@ -1794,6 +1781,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
"ldr q0, [%0], #16 \n" // B
"ldr q1, [%1], #16 \n" // G
"ldr q2, [%2], #16 \n" // R
"subs %w4, %w4, #8 \n"
"uqshl v0.8h, v0.8h, v31.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uqshl v1.8h, v1.8h, v31.8h \n"
@ -1802,7 +1790,6 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
"prfm pldl1keep, [%2, 448] \n"
"trn2 v0.16b, v0.16b, v1.16b \n"
"trn2 v1.16b, v2.16b, v3.16b \n"
"subs %w4, %w4, #8 \n"
"st2 {v0.8h, v1.8h}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_b), // %0
@ -1994,8 +1981,8 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"1: \n"
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
// RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
@ -2471,11 +2458,11 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
asm volatile(
"1: \n"
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
"subs %w4, %w4, #16 \n" // 16 pixels
"mov v3.8b, v2.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -2569,10 +2556,10 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"mov v1.16b, v0.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"mov v3.16b, v2.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
"b.gt 1b \n"
@ -2593,12 +2580,12 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
"ldr q4, [%3] \n" // shuffler
"1: \n"
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"tbl v0.16b, {v0.16b}, v4.16b \n"
"tbl v2.16b, {v2.16b}, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"mov v1.16b, v0.16b \n"
"mov v3.16b, v2.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
"b.gt 1b \n"
@ -2668,10 +2655,10 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"stp q0, q2, [%1], #32 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_ar64), // %0
@ -2692,10 +2679,10 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"stp q0, q2, [%1], #32 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_ab64), // %0
@ -2786,6 +2773,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
"movi v29.16b, #0x80 \n" // 128.5
"1: \n"
"ldp q0, q1, [%[src]], #32 \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
"movi v2.4s, #0 \n"
"movi v3.4s, #0 \n"
"movi v4.4s, #0 \n"
@ -2795,7 +2783,6 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
"usdot v4.4s, v0.16b, v17.16b \n"
"usdot v5.4s, v1.16b, v17.16b \n"
"prfm pldl1keep, [%[src], 448] \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
"uzp1 v0.8h, v2.8h, v3.8h \n"
"uzp1 v1.8h, v4.8h, v5.8h \n"
"addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned
@ -2877,6 +2864,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -2892,7 +2880,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -2924,6 +2911,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -2938,7 +2926,6 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -2969,6 +2956,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -2983,7 +2971,6 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3014,6 +3001,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -3028,7 +3016,6 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3059,6 +3046,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -3073,7 +3061,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3099,6 +3086,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
@ -3113,7 +3101,6 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
"urshr v1.8h, v3.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3139,6 +3126,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -3153,7 +3141,6 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
"urshr v2.8h, v2.8h, #1 \n"
"urshr v1.8h, v1.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3179,6 +3166,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
@ -3193,7 +3181,6 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3219,6 +3206,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
RGBTOUV_SETUP_REG
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -3233,7 +3221,6 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3259,6 +3246,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
RGBTOUV_SETUP_REG
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -3273,7 +3261,6 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v0.8h, v0.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3300,6 +3287,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
RGBTOUV_SETUP_REG
"1: \n"
"ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGB565TOARGB
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
@ -3317,7 +3305,6 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"urshr v1.8h, v17.8h, #1 \n"
"urshr v2.8h, v18.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3344,6 +3331,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
RGBTOUV_SETUP_REG
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGB555TOARGB
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
@ -3361,7 +3349,6 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"urshr v1.8h, v17.8h, #1 \n"
"urshr v2.8h, v18.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -3388,6 +3375,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
RGBTOUV_SETUP_REG // sets v20-v25
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
ARGB4444TORGB
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
@ -3405,7 +3393,6 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"urshr v1.8h, v17.8h, #1 \n"
"urshr v2.8h, v18.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -4210,9 +4197,9 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
"ldr q25, [%[indices]] \n"
"1: \n"
"ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
"movi v0.4s, #0 \n"
"movi v2.4s, #0 \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
"udot v0.4s, v1.16b, v24.16b \n"
"udot v2.4s, v3.16b, v24.16b \n"
"prfm pldl1keep, [%[src], 448] \n"
@ -4281,6 +4268,7 @@ void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) {
"ldr d23, [%[indices]] \n"
"1: \n"
"ldp q0, q1, [%[dst]] \n"
"subs %w1, %w1, #8 \n"
"movi v2.4s, #0 \n"
"movi v3.4s, #0 \n"
"movi v4.4s, #0 \n"
@ -4293,7 +4281,6 @@ void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) {
"udot v5.4s, v1.16b, v21.16b \n"
"udot v6.4s, v0.16b, v22.16b \n"
"udot v7.4s, v1.16b, v22.16b \n"
"subs %w1, %w1, #8 \n"
"prfm pldl1keep, [%[dst], 448] \n"
"uzp1 v6.8h, v6.8h, v7.8h \n"
"uzp1 v5.8h, v4.8h, v5.8h \n"
@ -4383,6 +4370,7 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
"1: \n"
"ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"movi v16.4s, #0 \n"
"movi v17.4s, #0 \n"
@ -4393,8 +4381,6 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
"movi v22.4s, #0 \n"
"movi v23.4s, #0 \n"
// 8 processed per loop.
"subs %w2, %w2, #8 \n"
"prfm pldl1keep, [%[src_argb], 448] \n"
"sudot v16.4s, v31.16b, v0.4b[0] \n"
@ -4609,6 +4595,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"1: \n"
"ld1 {v0.8b}, [%0],%5 \n" // top
"ld1 {v1.8b}, [%0],%6 \n"
"subs %w4, %w4, #8 \n" // 8 pixels
"usubl v0.8h, v0.8b, v1.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"ld1 {v2.8b}, [%1],%5 \n" // center * 2
@ -4619,7 +4606,6 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"add v0.8h, v0.8h, v1.8h \n"
"ld1 {v2.8b}, [%2],%5 \n" // bottom
"ld1 {v3.8b}, [%2],%6 \n"
"subs %w4, %w4, #8 \n" // 8 pixels
"prfm pldl1keep, [%2, 448] \n"
"usubl v1.8h, v2.8b, v3.8b \n"
"add v0.8h, v0.8h, v1.8h \n"
@ -4650,6 +4636,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"1: \n"
"ld1 {v0.8b}, [%0],%4 \n" // left
"ld1 {v1.8b}, [%1],%4 \n"
"subs %w3, %w3, #8 \n" // 8 pixels
"usubl v0.8h, v0.8b, v1.8b \n"
"ld1 {v2.8b}, [%0],%4 \n" // center * 2
"ld1 {v3.8b}, [%1],%4 \n"
@ -4658,7 +4645,6 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"add v0.8h, v0.8h, v1.8h \n"
"ld1 {v2.8b}, [%0],%5 \n" // right
"ld1 {v3.8b}, [%1],%5 \n"
"subs %w3, %w3, #8 \n" // 8 pixels
"usubl v1.8h, v2.8b, v3.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"add v0.8h, v0.8h, v1.8h \n"
@ -4881,8 +4867,8 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"prfm pldl1keep, [%0, 448] \n"
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
@ -4909,6 +4895,7 @@ void GaussCol_NEON(const uint16_t* src0,
"1: \n"
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
"ld1 {v2.8h}, [%4], #16 \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
"prfm pldl1keep, [%0, 448] \n"
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
@ -4924,7 +4911,6 @@ void GaussCol_NEON(const uint16_t* src0,
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"prfm pldl1keep, [%3, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
@ -4950,6 +4936,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"1: \n"
"ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
"subs %w5, %w5, #8 \n" // 8 processed per loop
"add v0.4s, v0.4s, v1.4s \n" // * 1
"add v1.4s, v1.4s, v2.4s \n" // * 1
"ld1 {v2.4s,v3.4s}, [%2], #32 \n"
@ -4962,7 +4949,6 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"prfm pldl1keep, [%0, 448] \n"
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"subs %w5, %w5, #8 \n" // 8 processed per loop
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
"uqrshrn2 v0.8h, v1.4s, #8 \n"
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
@ -4993,6 +4979,7 @@ void GaussCol_F32_NEON(const float* src0,
"1: \n"
"ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
"ld1 {v2.4s, v3.4s}, [%1], #32 \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
@ -5009,7 +4996,6 @@ void GaussCol_F32_NEON(const float* src0,
"prfm pldl1keep, [%3, 448] \n"
"fadd v1.4s, v1.4s, v5.4s \n"
"prfm pldl1keep, [%4, 448] \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src0), // %0
@ -5031,6 +5017,7 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) {
"1: \n"
"ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
// rows
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fadd v0.4s, v0.4s, v1.4s \n" // * 1
"ld1 {v4.4s, v5.4s}, [%0], %5 \n"
"fadd v1.4s, v1.4s, v2.4s \n"
@ -5045,7 +5032,6 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) {
"prfm pldl1keep, [%0, 448] \n"
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
"fmul v1.4s, v1.4s, v8.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
@ -5068,11 +5054,11 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
"prfm pldl1keep, [%0, 448] \n"
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -5100,12 +5086,12 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
"ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
"prfm pldl1keep, [%0, 448] \n"
"tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -5129,6 +5115,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
@ -5138,7 +5125,6 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"prfm pldl1keep, [%1, 448] \n"
"uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v2.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
@ -5158,6 +5144,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
@ -5167,7 +5154,6 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"prfm pldl1keep, [%1, 448] \n"
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v1.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
@ -5227,6 +5213,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
"ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
"ld1 {v2.16b}, [%1], #16 \n"
"ld1 {v3.16b}, [%3], #16 \n"
"subs %w5, %w5, #16 \n" // 16 src pixels per loop
"uaddlp v0.8h, v0.16b \n" // half size
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n"
@ -5237,7 +5224,6 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
"prfm pldl1keep, [%3, 448] \n"
"uqrshrn v0.8b, v0.8h, #2 \n"
"uqrshrn v1.8b, v1.8h, #2 \n"
"subs %w5, %w5, #16 \n" // 16 src pixels per loop
"st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
"b.gt 1b \n"
: "+r"(src_u), // %0
@ -5376,7 +5362,7 @@ void Convert8To8Row_NEON(const uint8_t* src_y,
"uzp2 v1.16b, v2.16b, v3.16b \n"
"add v0.16b, v0.16b, v5.16b \n" // add bias (16)
"add v1.16b, v1.16b, v5.16b \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"stp q0, q1, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1