mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
aarch32 J420ToI420
benchmark on medium core
adbrun -- taskset 10 blaze-bin/third_party/libyuv/libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
Now Neon
J420ToI420_Opt (159 ms)
Was C
J420ToI420_Opt (215 ms)
AArch64
J420ToI420_Opt (93 ms)
The C version (Convert8To8Row_C) compiles to this inner loop on aarch32:
vld1.8 {d20, d21}, [r6]!
vorr q12, q8, q8
subs r4, #16
vmovl.u8 q11, d21
vmovl.u8 q10, d20
vmul.i16 q11, q9, q11
vmul.i16 q10, q9, q10
vsra.u16 q12, q11, #8
vorr q11, q8, q8
vsra.u16 q11, q10, #8
vmovn.i16 d21, q12
vmovn.i16 d20, q11
vst1.8 {d20, d21}, [r5]!
bne 0x3d9078 <Convert8To8Row_C+0x36> @ imm = #-54
Explanation of the C code above:
vorr copies the vector holding 16 into a fresh register
vsra shifts right and accumulates into that register
Compared to the aarch64 version:
instead of mull, C uses movl + mul
instead of uzp2, C uses sra #8 + movn, which takes 2 movn vs 1 uzp2
instead of add, C uses vorr + sra
Change-Id: I9648f06e52ccbafaecf07bd89f8ffff27565d025
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6189497
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
26277baf96
commit
67f3f17d9a
@ -405,6 +405,7 @@ extern "C" {
|
||||
#define HAS_BGRATOYROW_NEON
|
||||
#define HAS_BYTETOFLOATROW_NEON
|
||||
#define HAS_CONVERT16TO8ROW_NEON
|
||||
#define HAS_CONVERT8TO8ROW_NEON
|
||||
#define HAS_COPYROW_NEON
|
||||
#define HAS_DETILEROW_16_NEON
|
||||
#define HAS_DETILEROW_NEON
|
||||
@ -507,7 +508,6 @@ extern "C" {
|
||||
|
||||
// The following are available on AArch64 platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_CONVERT8TO8ROW_NEON
|
||||
#define HAS_ARGBTOAR30ROW_NEON
|
||||
#define HAS_ABGRTOAR30ROW_NEON
|
||||
#define HAS_I210ALPHATOARGBROW_NEON
|
||||
|
||||
@ -143,9 +143,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
RGBTORGB8
|
||||
"subs %[width], %[width], #8 \n"
|
||||
"1: \n" READYUV444
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
|
||||
"bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -166,9 +165,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
RGBTORGB8
|
||||
"subs %[width], %[width], #8 \n"
|
||||
"1: \n" READYUV444
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
"vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
|
||||
"bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -190,9 +188,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8
|
||||
"subs %[width], %[width], #8 \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
|
||||
"bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -214,10 +211,9 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
RGBTORGB8
|
||||
"1: \n" READYUV444
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
"vld1.8 {d6}, [%[src_a]]! \n"
|
||||
"subs %[width], %[width], #8 \n"
|
||||
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
|
||||
"bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -240,10 +236,9 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8
|
||||
"1: \n" READYUV422
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
"vld1.8 {d6}, [%[src_a]]! \n"
|
||||
"subs %[width], %[width], #8 \n"
|
||||
"vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n"
|
||||
"bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -266,9 +261,9 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA
|
||||
"bgt 1b \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
STORERGBA "bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_u] "+r"(src_u), // %[src_u]
|
||||
[src_v] "+r"(src_v), // %[src_v]
|
||||
@ -288,9 +283,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8
|
||||
"subs %[width], %[width], #8 \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
"vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n"
|
||||
"bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -319,8 +313,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565
|
||||
"1: \n" READYUV422
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
ARGBTORGB565
|
||||
"vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565.
|
||||
"bgt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -350,9 +345,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8
|
||||
"subs %[width], %[width], #8 \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
|
||||
"vmov.u8 d6, #0xff \n" ARGBTOARGB1555
|
||||
"vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555.
|
||||
"bgt 1b \n"
|
||||
@ -425,9 +419,9 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d20}, [%0]! \n"
|
||||
"subs %2, %2, #8 \n"
|
||||
"vmov d21, d20 \n"
|
||||
"vmov d22, d20 \n"
|
||||
"subs %2, %2, #8 \n"
|
||||
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -731,6 +725,7 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
|
||||
"vld1.8 {q11}, [%0]! \n"
|
||||
"vld1.8 {q13}, [%0]! \n"
|
||||
"vld1.8 {q15}, [%0]! \n"
|
||||
"subs %2, %2, #80 \n"
|
||||
"vshl.u8 q8, q14, #6 \n" // Shift lower bit data
|
||||
// appropriately.
|
||||
"vshl.u8 q10, q14, #4 \n"
|
||||
@ -753,7 +748,6 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
|
||||
"vsri.u16 q15, q15, #10 \n"
|
||||
"vstmia %1!, {q8-q15} \n" // Store pixel block (64
|
||||
// pixels).
|
||||
"subs %2, %2, #80 \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -954,6 +948,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
|
||||
"vld1.16 {d4}, [%2]! \n" // B
|
||||
"vld1.16 {d2}, [%1]! \n" // G
|
||||
"vld1.16 {d0}, [%0]! \n" // R
|
||||
"subs %4, %4, #4 \n"
|
||||
"vmovl.u16 q2, d4 \n" // B
|
||||
"vmovl.u16 q1, d2 \n" // G
|
||||
"vmovl.u16 q0, d0 \n" // R
|
||||
@ -966,7 +961,6 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
|
||||
"vsli.u32 q2, q1, #10 \n" // 00GB
|
||||
"vsli.u32 q2, q0, #20 \n" // 0RGB
|
||||
"vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
|
||||
"subs %4, %4, #4 \n"
|
||||
"vst1.8 {q2}, [%3]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
@ -990,6 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
|
||||
"vld1.16 {d4}, [%2]! \n" // B
|
||||
"vld1.16 {d2}, [%1]! \n" // G
|
||||
"vld1.16 {d0}, [%0]! \n" // R
|
||||
"subs %4, %4, #4 \n"
|
||||
"vmovl.u16 q2, d4 \n" // 000B
|
||||
"vmovl.u16 q1, d2 \n" // G
|
||||
"vmovl.u16 q0, d0 \n" // R
|
||||
@ -999,7 +994,6 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
|
||||
"vsli.u32 q2, q1, #10 \n" // 00GB
|
||||
"vsli.u32 q2, q0, #20 \n" // 0RGB
|
||||
"vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30)
|
||||
"subs %4, %4, #4 \n"
|
||||
"vst1.8 {q2}, [%3]! \n"
|
||||
"bgt 1b \n"
|
||||
"3: \n"
|
||||
@ -1030,6 +1024,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
|
||||
"vld1.16 {q1}, [%1]! \n" // G
|
||||
"vld1.16 {q0}, [%2]! \n" // B
|
||||
"vld1.16 {q3}, [%3]! \n" // A
|
||||
"subs %5, %5, #8 \n"
|
||||
"vmin.u16 q2, q2, q14 \n"
|
||||
"vmin.u16 q1, q1, q14 \n"
|
||||
"vmin.u16 q0, q0, q14 \n"
|
||||
@ -1038,7 +1033,6 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
|
||||
"vshl.u16 q1, q1, q15 \n"
|
||||
"vshl.u16 q0, q0, q15 \n"
|
||||
"vshl.u16 q3, q3, q15 \n"
|
||||
"subs %5, %5, #8 \n"
|
||||
"vst4.16 {d0, d2, d4, d6}, [%4]! \n"
|
||||
"vst4.16 {d1, d3, d5, d7}, [%4]! \n"
|
||||
"bgt 1b \n"
|
||||
@ -1070,13 +1064,13 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
|
||||
"vld1.16 {q2}, [%0]! \n" // R
|
||||
"vld1.16 {q1}, [%1]! \n" // G
|
||||
"vld1.16 {q0}, [%2]! \n" // B
|
||||
"subs %4, %4, #8 \n"
|
||||
"vmin.u16 q2, q2, q14 \n"
|
||||
"vmin.u16 q1, q1, q14 \n"
|
||||
"vmin.u16 q0, q0, q14 \n"
|
||||
"vshl.u16 q2, q2, q15 \n"
|
||||
"vshl.u16 q1, q1, q15 \n"
|
||||
"vshl.u16 q0, q0, q15 \n"
|
||||
"subs %4, %4, #8 \n"
|
||||
"vst4.16 {d0, d2, d4, d6}, [%3]! \n"
|
||||
"vst4.16 {d1, d3, d5, d7}, [%3]! \n"
|
||||
"bgt 1b \n"
|
||||
@ -1106,6 +1100,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
|
||||
"vld1.16 {q1}, [%1]! \n" // G
|
||||
"vld1.16 {q0}, [%2]! \n" // B
|
||||
"vld1.16 {q3}, [%3]! \n" // A
|
||||
"subs %5, %5, #8 \n"
|
||||
"vshl.u16 q2, q2, q15 \n"
|
||||
"vshl.u16 q1, q1, q15 \n"
|
||||
"vshl.u16 q0, q0, q15 \n"
|
||||
@ -1114,7 +1109,6 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
|
||||
"vqmovn.u16 d1, q1 \n"
|
||||
"vqmovn.u16 d2, q2 \n"
|
||||
"vqmovn.u16 d3, q3 \n"
|
||||
"subs %5, %5, #8 \n"
|
||||
"vst4.8 {d0, d1, d2, d3}, [%4]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
@ -1142,13 +1136,13 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
|
||||
"vld1.16 {q2}, [%0]! \n" // R
|
||||
"vld1.16 {q1}, [%1]! \n" // G
|
||||
"vld1.16 {q0}, [%2]! \n" // B
|
||||
"subs %4, %4, #8 \n"
|
||||
"vshl.u16 q2, q2, q15 \n"
|
||||
"vshl.u16 q1, q1, q15 \n"
|
||||
"vshl.u16 q0, q0, q15 \n"
|
||||
"vqmovn.u16 d5, q2 \n"
|
||||
"vqmovn.u16 d4, q1 \n"
|
||||
"vqmovn.u16 d3, q0 \n"
|
||||
"subs %4, %4, #8 \n"
|
||||
"vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
@ -1600,8 +1594,8 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
|
||||
"add %1, %0, %1 \n" // stride + src_yuy2
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
|
||||
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
|
||||
"vst1.8 {d1}, [%2]! \n" // store 8 U.
|
||||
@ -1627,8 +1621,8 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
|
||||
"add %1, %0, %1 \n" // stride + src_uyvy
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
|
||||
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 U.
|
||||
@ -1924,6 +1918,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -1937,7 +1932,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q0, q1, q2)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -1970,6 +1964,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -1983,7 +1978,6 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q0, q1, q2)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2015,6 +2009,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
|
||||
@ -2028,7 +2023,6 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q2, q1, q0)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2061,6 +2055,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -2074,7 +2069,6 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q0, q1, q2)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2107,6 +2101,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -2120,7 +2115,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q2, q1, q0)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2152,6 +2146,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -2165,7 +2160,6 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
"vrshr.u16 q3, q3, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q3, q2, q1)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2197,6 +2191,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -2210,7 +2205,6 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q2, q1, q0)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2242,6 +2236,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -2255,7 +2250,6 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q0, q1, q2)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2287,6 +2281,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -2300,7 +2295,6 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q0, q1, q2)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2332,6 +2326,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
|
||||
@ -2345,7 +2340,6 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
|
||||
"vrshr.u16 q1, q1, #1 \n"
|
||||
"vrshr.u16 q2, q2, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q2, q1, q0)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2378,6 +2372,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGB565TOARGB
|
||||
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
|
||||
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
|
||||
@ -2403,7 +2398,6 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
|
||||
"vrshr.u16 q5, q5, #1 \n"
|
||||
"vrshr.u16 q6, q6, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vmul.s16 q8, q4, q10 \n" // B
|
||||
"vmls.s16 q8, q5, q11 \n" // G
|
||||
"vmls.s16 q8, q6, q12 \n" // R
|
||||
@ -2444,6 +2438,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGB555TOARGB
|
||||
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
|
||||
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
|
||||
@ -2469,7 +2464,6 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
|
||||
"vrshr.u16 q5, q5, #1 \n"
|
||||
"vrshr.u16 q6, q6, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vmul.s16 q8, q4, q10 \n" // B
|
||||
"vmls.s16 q8, q5, q11 \n" // G
|
||||
"vmls.s16 q8, q6, q12 \n" // R
|
||||
@ -2510,6 +2504,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
"vmov.u16 q15, #0x8080 \n" // 128.5
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
|
||||
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
|
||||
@ -2535,7 +2530,6 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
"vrshr.u16 q1, q5, #1 \n"
|
||||
"vrshr.u16 q2, q6, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(q0, q1, q2)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -2633,9 +2627,9 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n"
|
||||
"vld1.8 {q2}, [%0]! \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmov.u8 q1, q0 \n"
|
||||
"vmov.u8 q3, q2 \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
|
||||
"vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
|
||||
"bgt 1b \n"
|
||||
@ -2658,13 +2652,13 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n"
|
||||
"vld1.8 {q2}, [%0]! \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vtbl.8 d2, {d0, d1}, d8 \n"
|
||||
"vtbl.8 d3, {d0, d1}, d9 \n"
|
||||
"vtbl.8 d6, {d4, d5}, d8 \n"
|
||||
"vtbl.8 d7, {d4, d5}, d9 \n"
|
||||
"vmov.u8 q0, q1 \n"
|
||||
"vmov.u8 q2, q3 \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels
|
||||
"vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels
|
||||
"bgt 1b \n"
|
||||
@ -2684,11 +2678,11 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"vld1.16 {q2}, [%0]! \n"
|
||||
"vld1.16 {q3}, [%0]! \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vshrn.u16 d0, q0, #8 \n"
|
||||
"vshrn.u16 d1, q1, #8 \n"
|
||||
"vshrn.u16 d4, q2, #8 \n"
|
||||
"vshrn.u16 d5, q3, #8 \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vst1.8 {q0}, [%1]! \n" // store 4 pixels
|
||||
"vst1.8 {q2}, [%1]! \n" // store 4 pixels
|
||||
"bgt 1b \n"
|
||||
@ -2712,11 +2706,11 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"vld1.16 {q2}, [%0]! \n"
|
||||
"vld1.16 {q3}, [%0]! \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vtbl.8 d0, {d0, d1}, d8 \n"
|
||||
"vtbl.8 d1, {d2, d3}, d8 \n"
|
||||
"vtbl.8 d4, {d4, d5}, d8 \n"
|
||||
"vtbl.8 d5, {d6, d7}, d8 \n"
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vst1.8 {q0}, [%1]! \n" // store 4 pixels
|
||||
"vst1.8 {q2}, [%1]! \n" // store 4 pixels
|
||||
"bgt 1b \n"
|
||||
@ -3472,6 +3466,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0],%5 \n" // top
|
||||
"vld1.8 {d1}, [%0],%6 \n"
|
||||
"subs %4, %4, #8 \n" // 8 pixels
|
||||
"vsubl.u8 q0, d0, d1 \n"
|
||||
"vld1.8 {d2}, [%1],%5 \n" // center * 2
|
||||
"vld1.8 {d3}, [%1],%6 \n"
|
||||
@ -3480,7 +3475,6 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
||||
"vadd.s16 q0, q0, q1 \n"
|
||||
"vld1.8 {d2}, [%2],%5 \n" // bottom
|
||||
"vld1.8 {d3}, [%2],%6 \n"
|
||||
"subs %4, %4, #8 \n" // 8 pixels
|
||||
"vsubl.u8 q1, d2, d3 \n"
|
||||
"vadd.s16 q0, q0, q1 \n"
|
||||
"vabs.s16 q0, q0 \n"
|
||||
@ -3510,6 +3504,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0],%4 \n" // left
|
||||
"vld1.8 {d1}, [%1],%4 \n"
|
||||
"subs %3, %3, #8 \n" // 8 pixels
|
||||
"vsubl.u8 q0, d0, d1 \n"
|
||||
"vld1.8 {d2}, [%0],%4 \n" // center * 2
|
||||
"vld1.8 {d3}, [%1],%4 \n"
|
||||
@ -3518,7 +3513,6 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
||||
"vadd.s16 q0, q0, q1 \n"
|
||||
"vld1.8 {d2}, [%0],%5 \n" // right
|
||||
"vld1.8 {d3}, [%1],%5 \n"
|
||||
"subs %3, %3, #8 \n" // 8 pixels
|
||||
"vsubl.u8 q1, d2, d3 \n"
|
||||
"vadd.s16 q0, q0, q1 \n"
|
||||
"vabs.s16 q0, q0 \n"
|
||||
@ -3613,6 +3607,7 @@ void GaussCol_NEON(const uint16_t* src0,
|
||||
"1: \n"
|
||||
"vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows
|
||||
"vld1.16 {q2}, [%4]! \n"
|
||||
"subs %6, %6, #8 \n" // 8 processed per loop
|
||||
"vaddl.u16 q0, d2, d4 \n" // * 1
|
||||
"vaddl.u16 q1, d3, d5 \n" // * 1
|
||||
"vld1.16 {q2}, [%1]! \n"
|
||||
@ -3624,7 +3619,6 @@ void GaussCol_NEON(const uint16_t* src0,
|
||||
"vld1.16 {q2}, [%3]! \n"
|
||||
"vmlal.u16 q0, d4, d6 \n" // * 4
|
||||
"vmlal.u16 q1, d5, d6 \n" // * 4
|
||||
"subs %6, %6, #8 \n" // 8 processed per loop
|
||||
"vst1.32 {q0, q1}, [%5]! \n" // store 8 samples
|
||||
"bgt 1b \n"
|
||||
: "+r"(src0), // %0
|
||||
@ -3650,6 +3644,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
||||
"1: \n"
|
||||
"vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples
|
||||
"vld1.32 {q2}, [%0] \n"
|
||||
"subs %5, %5, #8 \n" // 8 processed per loop
|
||||
"vadd.u32 q0, q0, q1 \n" // * 1
|
||||
"vadd.u32 q1, q1, q2 \n" // * 1
|
||||
"vld1.32 {q2, q3}, [%2]! \n"
|
||||
@ -3661,7 +3656,6 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
||||
"vadd.u32 q3, q3, q9 \n"
|
||||
"vmla.u32 q0, q2, q10 \n" // * 4
|
||||
"vmla.u32 q1, q3, q10 \n" // * 4
|
||||
"subs %5, %5, #8 \n" // 8 processed per loop
|
||||
"vqshrn.u32 d0, q0, #8 \n" // round and pack
|
||||
"vqshrn.u32 d1, q1, #8 \n"
|
||||
"vst1.u16 {q0}, [%4]! \n" // store 8 samples
|
||||
@ -3685,11 +3679,11 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
||||
"1: \n"
|
||||
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
|
||||
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
|
||||
"subs %3, %3, #16 \n" // 16 pixels per loop
|
||||
"vmov d1, d0 \n"
|
||||
"vzip.u8 d0, d1 \n" // VV
|
||||
"vmov d3, d2 \n"
|
||||
"vzip.u8 d2, d3 \n" // UU
|
||||
"subs %3, %3, #16 \n" // 16 pixels per loop
|
||||
"vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
|
||||
"vst3.8 {d1, d3, d5}, [%2]! \n"
|
||||
"bgt 1b \n"
|
||||
@ -3711,6 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
|
||||
// pixels.
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
|
||||
@ -3721,7 +3716,6 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
|
||||
"vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
|
||||
"vqrshrun.s16 d0, q1, #2 \n"
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
@ -3742,6 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV
|
||||
// pixels.
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV
|
||||
@ -3752,7 +3747,6 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
|
||||
"vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
|
||||
"vqrshrun.s16 d1, q1, #2 \n"
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
@ -3786,8 +3780,8 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
||||
"vld2.8 {d1, d3}, [%0]! \n"
|
||||
"vmov.u8 q2, q0 \n" // move U after V
|
||||
"subs %2, %2, #16 \n" // 16 pixels per loop
|
||||
"vmov.u8 q2, q0 \n" // move U after V
|
||||
"vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
@ -3811,13 +3805,13 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
||||
"vld1.8 {q1}, [%2]! \n" // load 16 V values
|
||||
"vld1.8 {q2}, [%1]! \n"
|
||||
"vld1.8 {q3}, [%3]! \n"
|
||||
"subs %5, %5, #16 \n" // 16 src pixels per loop
|
||||
"vpaddl.u8 q0, q0 \n" // half size
|
||||
"vpaddl.u8 q1, q1 \n"
|
||||
"vpadal.u8 q0, q2 \n"
|
||||
"vpadal.u8 q1, q3 \n"
|
||||
"vqrshrn.u16 d0, q0, #2 \n"
|
||||
"vqrshrn.u16 d1, q1, #2 \n"
|
||||
"subs %5, %5, #16 \n" // 16 src pixels per loop
|
||||
"vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
@ -3840,9 +3834,9 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
|
||||
"vdup.16 q2, %4 \n"
|
||||
"1: \n"
|
||||
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
|
||||
"subs %3, %3, #8 \n" // 8 src pixels per loop
|
||||
"vshl.u16 q0, q0, q2 \n"
|
||||
"vshl.u16 q1, q1, q2 \n"
|
||||
"subs %3, %3, #8 \n" // 8 src pixels per loop
|
||||
"vst1.16 {q0}, [%1]! \n" // store 8 U pixels
|
||||
"vst1.16 {q1}, [%2]! \n" // store 8 V pixels
|
||||
"bgt 1b \n"
|
||||
@ -3865,9 +3859,9 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n" // load 8 U
|
||||
"vld1.16 {q1}, [%1]! \n" // load 8 V
|
||||
"subs %3, %3, #8 \n" // 8 src pixels per loop
|
||||
"vshl.u16 q0, q0, q2 \n"
|
||||
"vshl.u16 q1, q1, q2 \n"
|
||||
"subs %3, %3, #8 \n" // 8 src pixels per loop
|
||||
"vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
@ -3887,11 +3881,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"vmul.u16 q0, q0, q2 \n"
|
||||
"vmul.u16 q1, q1, q2 \n"
|
||||
"vst1.16 {q0}, [%1]! \n"
|
||||
"vst1.16 {q1}, [%1]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -3908,6 +3902,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
"vdup.16 d8, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q2, q3}, [%0]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"vmull.u16 q0, d4, d8 \n"
|
||||
"vmull.u16 q1, d5, d8 \n"
|
||||
"vmull.u16 q2, d6, d8 \n"
|
||||
@ -3917,7 +3912,6 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
"vshrn.u32 d2, q2, #16 \n"
|
||||
"vshrn.u32 d3, q3, #16 \n"
|
||||
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
@ -3941,11 +3935,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"vshl.u16 q0, q0, q2 \n" // shr = q2 is negative
|
||||
"vshl.u16 q1, q1, q2 \n"
|
||||
"vqmovn.u16 d0, q0 \n"
|
||||
"vqmovn.u16 d1, q1 \n"
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"vst1.8 {q0}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -3955,6 +3949,41 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
: "cc", "memory", "q0", "q1", "q2");
|
||||
}
|
||||
|
||||
// Use scale to convert J420 to I420
// Per byte this computes dst = ((src * scale) >> 8) + bias using 8 bit lanes.
|
||||
// scale parameter is 8.8 fixed point but limited to 0 to 255
// (vdup.8 broadcasts only the low 8 bits of scale, and likewise of bias).
|
||||
// Function is based on DivideRow, but adds a bias
|
||||
// Does not clamp
// (the bias is added with a plain, non-saturating vadd.u8, so the sum wraps).
// The loop body runs before the count is tested and always handles 32 bytes;
// presumably callers pass a width that is a positive multiple of 32 or go
// through a wrapper that handles the remainder - TODO confirm against callers.
|
||||
void Convert8To8Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int bias,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vdup.8 d8, %3 \n" // d8 = scale in all 8 byte lanes
|
||||
"vdup.8 q5, %4 \n" // q5 = bias in all 16 byte lanes
|
||||
"1: \n"
|
||||
"vld1.8 {q2, q3}, [%0]! \n" // load 32 source bytes, advance src_y
|
||||
"subs %2, %2, #32 \n" // 32 src pixels per loop
|
||||
"vmull.u8 q0, d4, d8 \n" // widen: 8 bit src * scale -> 16 bit
|
||||
"vmull.u8 q1, d5, d8 \n"
|
||||
"vmull.u8 q2, d6, d8 \n" // q2 reused: d4/d5 already consumed above
|
||||
"vmull.u8 q3, d7, d8 \n" // q3 reused: d6 consumed by previous line
|
||||
"vshrn.u16 d0, q0, #8 \n" // narrow: keep high byte of each product
|
||||
"vshrn.u16 d1, q1, #8 \n"
|
||||
"vshrn.u16 d2, q2, #8 \n"
|
||||
"vshrn.u16 d3, q3, #8 \n"
|
||||
"vadd.u8 q0, q0, q5 \n" // add bias (wrapping 8 bit add)
|
||||
"vadd.u8 q1, q1, q5 \n"
|
||||
"vst1.8 {q0, q1}, [%1]! \n" // store 32 pixels
|
||||
"bgt 1b \n" // loop while remaining width > 0 (flags from subs)
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale), // %3
|
||||
"r"(bias) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "d8", "q5");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -242,9 +242,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n" /* A */
|
||||
"1: \n" READYUV444 I4XXTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV444
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -265,9 +264,8 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV444 I4XXTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV444
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
|
||||
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -293,8 +291,8 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"dup v22.8h, %w[limit] \n"
|
||||
"dup v23.8h, %w[alpha] \n"
|
||||
"1: \n" READYUV210 NVTORGB
|
||||
"subs %w[width], %w[width], #8 \n" STOREAR30
|
||||
"1: \n" READYUV210
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_u] "+r"(src_u), // %[src_u]
|
||||
@ -321,8 +319,8 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"dup v22.8h, %w[limit] \n"
|
||||
"dup v23.8h, %w[alpha] \n"
|
||||
"1: \n" READYUV410 NVTORGB
|
||||
"subs %w[width], %w[width], #8 \n" STOREAR30
|
||||
"1: \n" READYUV410
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_u] "+r"(src_u), // %[src_u]
|
||||
@ -348,8 +346,8 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"dup v22.8h, %w[limit] \n"
|
||||
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
||||
"1: \n" READYUV212 NVTORGB
|
||||
"subs %w[width], %w[width], #8 \n" STOREAR30
|
||||
"1: \n" READYUV212
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_u] "+r"(src_u), // %[src_u]
|
||||
@ -371,9 +369,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n"
|
||||
"1: \n" READYUV210 NVTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV210
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -395,9 +392,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n"
|
||||
"1: \n" READYUV410 NVTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV410
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -421,9 +417,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n"
|
||||
"1: \n" READYUV212 NVTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV212
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -445,9 +440,8 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n" /* A */
|
||||
"1: \n" READYUV422 I4XXTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -475,8 +469,8 @@ void P210ToARGBRow_NEON(const uint16_t* src_y,
|
||||
"movi v19.8b, #255 \n"
|
||||
"ldr q2, [%[kIndices]] \n"
|
||||
"1: \n" //
|
||||
READYUVP210 NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
READYUVP210
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -504,8 +498,8 @@ void P410ToARGBRow_NEON(const uint16_t* src_y,
|
||||
"movi v19.8b, #255 \n"
|
||||
"ldr q2, [%[kIndices]] \n"
|
||||
"1: \n" //
|
||||
READYUVP410 NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
READYUVP410
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -526,23 +520,22 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
|
||||
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
|
||||
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
||||
const uint16_t limit = 0x3ff0;
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"dup v22.8h, %w[limit] \n"
|
||||
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
||||
"ldr q2, [%[kIndices]] \n"
|
||||
"1: \n" READYUVP210 NVTORGB
|
||||
"subs %w[width], %w[width], #8 \n" STOREAR30
|
||||
"1: \n" READYUVP210
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_uv] "+r"(src_uv), // %[src_uv]
|
||||
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
|
||||
[limit] "r"(limit), // %[limit]
|
||||
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
|
||||
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_uv] "+r"(src_uv), // %[src_uv]
|
||||
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
|
||||
[limit] "r"(limit), // %[limit]
|
||||
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
|
||||
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
|
||||
}
|
||||
|
||||
void P410ToAR30Row_NEON(const uint16_t* src_y,
|
||||
@ -553,23 +546,22 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
|
||||
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
|
||||
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
|
||||
uint16_t limit = 0x3ff0;
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
asm volatile(YUVTORGB_SETUP
|
||||
"dup v22.8h, %w[limit] \n"
|
||||
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
||||
"ldr q2, [%[kIndices]] \n"
|
||||
"1: \n" READYUVP410 NVTORGB
|
||||
"subs %w[width], %w[width], #8 \n" STOREAR30
|
||||
"1: \n" READYUVP410
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_uv] "+r"(src_uv), // %[src_uv]
|
||||
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
|
||||
[limit] "r"(limit), // %[limit]
|
||||
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
|
||||
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_uv] "+r"(src_uv), // %[src_uv]
|
||||
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
|
||||
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
|
||||
[limit] "r"(limit), // %[limit]
|
||||
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
|
||||
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
|
||||
}
|
||||
|
||||
void I422ToAR30Row_NEON(const uint8_t* src_y,
|
||||
@ -585,8 +577,8 @@ void I422ToAR30Row_NEON(const uint8_t* src_y,
|
||||
YUVTORGB_SETUP
|
||||
"dup v22.8h, %w[limit] \n"
|
||||
"movi v23.8h, #0xc0, lsl #8 \n" // A
|
||||
"1: \n" READYUV422 I4XXTORGB
|
||||
"subs %w[width], %w[width], #8 \n" STOREAR30
|
||||
"1: \n" READYUV422
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB STOREAR30
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
[src_u] "+r"(src_u), // %[src_u]
|
||||
@ -610,8 +602,8 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
|
||||
YUVTORGB_SETUP
|
||||
"1: \n"
|
||||
"ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
|
||||
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -636,8 +628,8 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
|
||||
YUVTORGB_SETUP
|
||||
"1: \n"
|
||||
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
|
||||
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -662,8 +654,8 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
|
||||
YUVTORGB_SETUP
|
||||
"1: \n"
|
||||
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
|
||||
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -688,8 +680,8 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
|
||||
YUVTORGB_SETUP
|
||||
"1: \n"
|
||||
"ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
|
||||
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"prfm pldl1keep, [%[src_a], 448] \n" I4XXTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -712,9 +704,8 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v15.8b, #255 \n" /* A */
|
||||
"1: \n" READYUV422 I4XXTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
|
||||
"st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -735,9 +726,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 I4XXTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
|
||||
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -777,9 +767,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 I4XXTORGB
|
||||
RGBTORGB8_TOP
|
||||
"subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
|
||||
"1: \n" READYUV422
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8_TOP
|
||||
ARGBTORGB565_FROM_TOP
|
||||
"st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -818,10 +808,9 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8h, #0x80, lsl #8 \n"
|
||||
"1: \n" //
|
||||
READYUV422 I4XXTORGB RGBTORGB8_TOP
|
||||
"subs %w[width], %w[width], #8 \n" //
|
||||
ARGBTOARGB1555_FROM_TOP
|
||||
"1: \n" //
|
||||
READYUV422 "subs %w[width], %w[width], #8 \n" //
|
||||
I4XXTORGB RGBTORGB8_TOP ARGBTOARGB1555_FROM_TOP
|
||||
"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
|
||||
// RGB1555.
|
||||
"b.gt 1b \n"
|
||||
@ -849,9 +838,8 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 I4XXTORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUV422
|
||||
"subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8
|
||||
"movi v19.8b, #255 \n" ARGBTOARGB4444
|
||||
"st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
|
||||
// pixels
|
||||
@ -880,8 +868,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
|
||||
"umull v4.8h, v1.8b, v28.8b \n" /* DB */
|
||||
"umull2 v5.8h, v1.16b, v29.16b \n" /* DR */
|
||||
"1: \n" READYUV400 I400TORGB
|
||||
RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"subs %w[width], %w[width], #8 \n" RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -898,10 +885,10 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
"movi v23.8b, #255 \n"
|
||||
"1: \n"
|
||||
"ld1 {v20.8b}, [%0], #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"mov v21.8b, v20.8b \n"
|
||||
"mov v22.8b, v20.8b \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -941,8 +928,8 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n"
|
||||
"ldr q2, [%[kNV12Table]] \n"
|
||||
"1: \n" READNV12 NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READNV12
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -964,8 +951,8 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n"
|
||||
"ldr q2, [%[kNV12Table]] \n"
|
||||
"1: \n" READNV12 NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READNV12
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -986,8 +973,8 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"ldr q2, [%[kNV12Table]] \n"
|
||||
"1: \n" READNV12 NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READNV12
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -1008,8 +995,8 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"ldr q2, [%[kNV12Table]] \n"
|
||||
"1: \n" READNV12 NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READNV12
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_y] "+r"(src_y), // %[src_y]
|
||||
@ -1030,9 +1017,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"ldr q2, [%[kNV12Table]] \n"
|
||||
"1: \n" READNV12 NVTORGB
|
||||
RGBTORGB8_TOP
|
||||
"subs %w[width], %w[width], #8 \n" ARGBTORGB565_FROM_TOP
|
||||
"1: \n" READNV12
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8_TOP
|
||||
ARGBTORGB565_FROM_TOP
|
||||
"st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
|
||||
// pixels
|
||||
// RGB565.
|
||||
@ -1055,8 +1042,8 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n"
|
||||
"ldr q2, [%[kNV21InterleavedTable]] \n"
|
||||
"1: \n" READYUY2 NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READYUY2
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
|
||||
@ -1076,8 +1063,8 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
|
||||
YUVTORGB_SETUP
|
||||
"movi v19.8b, #255 \n"
|
||||
"ldr q2, [%[kNV12InterleavedTable]] \n"
|
||||
"1: \n" READUYVY NVTORGB RGBTORGB8
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"1: \n" READUYVY
|
||||
"subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8
|
||||
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_uyvy] "+r"(src_uyvy), // %[src_yuy2]
|
||||
@ -1188,10 +1175,10 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
|
||||
"subs %w3, %w3, #16 \n" // store 8 YUY2
|
||||
"prfm pldl1keep, [%0, 1792] \n"
|
||||
"ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
|
||||
"prfm pldl1keep, [%1, 1792] \n"
|
||||
"subs %w3, %w3, #16 \n" // store 8 YUY2
|
||||
"st2 {v0.16b,v1.16b}, [%2], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -1240,10 +1227,10 @@ void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
|
||||
"1: \n"
|
||||
"ld1 {v7.16b}, [%0], #16 \n"
|
||||
"ld1 {v0.16b-v3.16b}, [%0], #64 \n"
|
||||
"subs %2, %2, #80 \n"
|
||||
"shl v4.16b, v7.16b, #6 \n"
|
||||
"shl v5.16b, v7.16b, #4 \n"
|
||||
"shl v6.16b, v7.16b, #2 \n"
|
||||
"subs %2, %2, #80 \n"
|
||||
"zip1 v16.16b, v4.16b, v0.16b \n"
|
||||
"zip1 v18.16b, v5.16b, v1.16b \n"
|
||||
"zip1 v20.16b, v6.16b, v2.16b \n"
|
||||
@ -1305,8 +1292,8 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
|
||||
"dup v2.8h, %w4 \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
|
||||
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
|
||||
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
|
||||
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
|
||||
"ushl v0.8h, v0.8h, v2.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"ushl v1.8h, v1.8h, v2.8h \n"
|
||||
@ -1356,8 +1343,8 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
|
||||
"dup v4.8h, %w4 \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.8h}, [%0], #16 \n" // load 8 U
|
||||
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
|
||||
"ld1 {v1.8h}, [%1], #16 \n" // load 8 V
|
||||
"subs %w3, %w3, #8 \n" // 8 src pixels per loop
|
||||
"ushl v0.8h, v0.8h, v4.8h \n"
|
||||
"ushl v1.8h, v1.8h, v4.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
@ -1596,6 +1583,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
|
||||
"ldr d2, [%2], #8 \n" // B
|
||||
"ldr d1, [%1], #8 \n" // G
|
||||
"ldr d0, [%0], #8 \n" // R
|
||||
"subs %w4, %w4, #4 \n"
|
||||
"ushll v2.4s, v2.4h, #0 \n" // B
|
||||
"ushll v1.4s, v1.4h, #0 \n" // G
|
||||
"ushll v0.4s, v0.4h, #0 \n" // R
|
||||
@ -1608,7 +1596,6 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
|
||||
"sli v2.4s, v1.4s, #10 \n" // 00GB
|
||||
"sli v2.4s, v0.4s, #20 \n" // 0RGB
|
||||
"orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
|
||||
"subs %w4, %w4, #4 \n"
|
||||
"str q2, [%3], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
@ -1637,6 +1624,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
|
||||
"ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr
|
||||
"ldr q1, [%1], #16 \n" // xxxxxxGggggggggg
|
||||
"ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr
|
||||
"umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg
|
||||
"movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000
|
||||
@ -1644,7 +1632,6 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
|
||||
"mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000
|
||||
"mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb
|
||||
"usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"st2 {v3.8h, v4.8h}, [%3], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
@ -1674,6 +1661,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
|
||||
"ldr q1, [%1], #16 \n" // G
|
||||
"ldr q0, [%2], #16 \n" // B
|
||||
"ldr q3, [%3], #16 \n" // A
|
||||
"subs %w5, %w5, #8 \n"
|
||||
"umin v2.8h, v2.8h, v30.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"umin v1.8h, v1.8h, v30.8h \n"
|
||||
@ -1686,7 +1674,6 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
|
||||
"ushl v1.8h, v1.8h, v31.8h \n"
|
||||
"ushl v0.8h, v0.8h, v31.8h \n"
|
||||
"ushl v3.8h, v3.8h, v31.8h \n"
|
||||
"subs %w5, %w5, #8 \n"
|
||||
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
@ -1718,6 +1705,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
|
||||
"ldr q2, [%0], #16 \n" // R
|
||||
"ldr q1, [%1], #16 \n" // G
|
||||
"ldr q0, [%2], #16 \n" // B
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"umin v2.8h, v2.8h, v30.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"umin v1.8h, v1.8h, v30.8h \n"
|
||||
@ -1727,7 +1715,6 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
|
||||
"ushl v2.8h, v2.8h, v31.8h \n"
|
||||
"ushl v1.8h, v1.8h, v31.8h \n"
|
||||
"ushl v0.8h, v0.8h, v31.8h \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_r), // %0
|
||||
@ -1756,6 +1743,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
|
||||
"ldr q1, [%1], #16 \n" // G
|
||||
"ldr q2, [%2], #16 \n" // R
|
||||
"ldr q3, [%3], #16 \n" // A
|
||||
"subs %w5, %w5, #8 \n"
|
||||
"uqshl v0.8h, v0.8h, v31.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uqshl v1.8h, v1.8h, v31.8h \n"
|
||||
@ -1766,7 +1754,6 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"trn2 v0.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v1.16b, v2.16b, v3.16b \n"
|
||||
"subs %w5, %w5, #8 \n"
|
||||
"st2 {v0.8h, v1.8h}, [%4], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_b), // %0
|
||||
@ -1794,6 +1781,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
|
||||
"ldr q0, [%0], #16 \n" // B
|
||||
"ldr q1, [%1], #16 \n" // G
|
||||
"ldr q2, [%2], #16 \n" // R
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"uqshl v0.8h, v0.8h, v31.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uqshl v1.8h, v1.8h, v31.8h \n"
|
||||
@ -1802,7 +1790,6 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"trn2 v0.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v1.16b, v2.16b, v3.16b \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"st2 {v0.8h, v1.8h}, [%3], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_b), // %0
|
||||
@ -1994,8 +1981,8 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
|
||||
"1: \n"
|
||||
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
|
||||
// RGB24.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb24), // %0
|
||||
@ -2471,11 +2458,11 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
"mov v3.8b, v2.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
|
||||
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -2569,10 +2556,10 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb,
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"mov v1.16b, v0.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"mov v3.16b, v2.16b \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
|
||||
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
|
||||
"b.gt 1b \n"
|
||||
@ -2593,12 +2580,12 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
||||
"ldr q4, [%3] \n" // shuffler
|
||||
"1: \n"
|
||||
"ldp q0, q2, [%0], #32 \n" // load 8 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"tbl v0.16b, {v0.16b}, v4.16b \n"
|
||||
"tbl v2.16b, {v2.16b}, v4.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"mov v1.16b, v0.16b \n"
|
||||
"mov v3.16b, v2.16b \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
|
||||
"st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
|
||||
"b.gt 1b \n"
|
||||
@ -2668,10 +2655,10 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
|
||||
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"stp q0, q2, [%1], #32 \n" // store 8 pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ar64), // %0
|
||||
@ -2692,10 +2679,10 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%0], #32 \n" // load 4 pixels
|
||||
"ldp q2, q3, [%0], #32 \n" // load 4 pixels
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"stp q0, q2, [%1], #32 \n" // store 8 pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ab64), // %0
|
||||
@ -2786,6 +2773,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
|
||||
"movi v29.16b, #0x80 \n" // 128.5
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[src]], #32 \n"
|
||||
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
|
||||
"movi v2.4s, #0 \n"
|
||||
"movi v3.4s, #0 \n"
|
||||
"movi v4.4s, #0 \n"
|
||||
@ -2795,7 +2783,6 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
|
||||
"usdot v4.4s, v0.16b, v17.16b \n"
|
||||
"usdot v5.4s, v1.16b, v17.16b \n"
|
||||
"prfm pldl1keep, [%[src], 448] \n"
|
||||
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
|
||||
"uzp1 v0.8h, v2.8h, v3.8h \n"
|
||||
"uzp1 v1.8h, v4.8h, v5.8h \n"
|
||||
"addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned
|
||||
@ -2877,6 +2864,7 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -2892,7 +2880,6 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -2924,6 +2911,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
|
||||
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -2938,7 +2926,6 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -2969,6 +2956,7 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
|
||||
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -2983,7 +2971,6 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v2.8h, v1.8h, v0.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3014,6 +3001,7 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
|
||||
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
|
||||
"1: \n"
|
||||
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -3028,7 +3016,6 @@ void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3059,6 +3046,7 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
|
||||
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
|
||||
"1: \n"
|
||||
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -3073,7 +3061,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v2.8h, v1.8h, v0.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3099,6 +3086,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -3113,7 +3101,6 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
|
||||
"urshr v1.8h, v3.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3139,6 +3126,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -3153,7 +3141,6 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v2.8h, v1.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3179,6 +3166,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -3193,7 +3181,6 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3219,6 +3206,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -3233,7 +3221,6 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3259,6 +3246,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
@ -3273,7 +3261,6 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v0.8h, v0.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v2.8h, v1.8h, v0.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3300,6 +3287,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGB565TOARGB
|
||||
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
@ -3317,7 +3305,6 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
|
||||
"urshr v1.8h, v17.8h, #1 \n"
|
||||
"urshr v2.8h, v18.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3344,6 +3331,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGB555TOARGB
|
||||
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
@ -3361,7 +3349,6 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
|
||||
"urshr v1.8h, v17.8h, #1 \n"
|
||||
"urshr v2.8h, v18.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -3388,6 +3375,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
RGBTOUV_SETUP_REG // sets v20-v25
|
||||
"1: \n"
|
||||
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
ARGB4444TORGB
|
||||
"uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
@ -3405,7 +3393,6 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
"urshr v1.8h, v17.8h, #1 \n"
|
||||
"urshr v2.8h, v18.8h, #1 \n"
|
||||
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
@ -4210,9 +4197,9 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb,
|
||||
"ldr q25, [%[indices]] \n"
|
||||
"1: \n"
|
||||
"ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB
|
||||
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
|
||||
"movi v0.4s, #0 \n"
|
||||
"movi v2.4s, #0 \n"
|
||||
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
|
||||
"udot v0.4s, v1.16b, v24.16b \n"
|
||||
"udot v2.4s, v3.16b, v24.16b \n"
|
||||
"prfm pldl1keep, [%[src], 448] \n"
|
||||
@ -4281,6 +4268,7 @@ void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) {
|
||||
"ldr d23, [%[indices]] \n"
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[dst]] \n"
|
||||
"subs %w1, %w1, #8 \n"
|
||||
"movi v2.4s, #0 \n"
|
||||
"movi v3.4s, #0 \n"
|
||||
"movi v4.4s, #0 \n"
|
||||
@ -4293,7 +4281,6 @@ void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) {
|
||||
"udot v5.4s, v1.16b, v21.16b \n"
|
||||
"udot v6.4s, v0.16b, v22.16b \n"
|
||||
"udot v7.4s, v1.16b, v22.16b \n"
|
||||
"subs %w1, %w1, #8 \n"
|
||||
"prfm pldl1keep, [%[dst], 448] \n"
|
||||
"uzp1 v6.8h, v6.8h, v7.8h \n"
|
||||
"uzp1 v5.8h, v4.8h, v5.8h \n"
|
||||
@ -4383,6 +4370,7 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
|
||||
"movi v16.4s, #0 \n"
|
||||
"movi v17.4s, #0 \n"
|
||||
@ -4393,8 +4381,6 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
"movi v22.4s, #0 \n"
|
||||
"movi v23.4s, #0 \n"
|
||||
|
||||
// 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"prfm pldl1keep, [%[src_argb], 448] \n"
|
||||
|
||||
"sudot v16.4s, v31.16b, v0.4b[0] \n"
|
||||
@ -4609,6 +4595,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
||||
"1: \n"
|
||||
"ld1 {v0.8b}, [%0],%5 \n" // top
|
||||
"ld1 {v1.8b}, [%0],%6 \n"
|
||||
"subs %w4, %w4, #8 \n" // 8 pixels
|
||||
"usubl v0.8h, v0.8b, v1.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"ld1 {v2.8b}, [%1],%5 \n" // center * 2
|
||||
@ -4619,7 +4606,6 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
"ld1 {v2.8b}, [%2],%5 \n" // bottom
|
||||
"ld1 {v3.8b}, [%2],%6 \n"
|
||||
"subs %w4, %w4, #8 \n" // 8 pixels
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"usubl v1.8h, v2.8b, v3.8b \n"
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
@ -4650,6 +4636,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
||||
"1: \n"
|
||||
"ld1 {v0.8b}, [%0],%4 \n" // left
|
||||
"ld1 {v1.8b}, [%1],%4 \n"
|
||||
"subs %w3, %w3, #8 \n" // 8 pixels
|
||||
"usubl v0.8h, v0.8b, v1.8b \n"
|
||||
"ld1 {v2.8b}, [%0],%4 \n" // center * 2
|
||||
"ld1 {v3.8b}, [%1],%4 \n"
|
||||
@ -4658,7 +4645,6 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
"ld1 {v2.8b}, [%0],%5 \n" // right
|
||||
"ld1 {v3.8b}, [%1],%5 \n"
|
||||
"subs %w3, %w3, #8 \n" // 8 pixels
|
||||
"usubl v1.8h, v2.8b, v3.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
@ -4881,8 +4867,8 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
|
||||
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
|
||||
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
|
||||
@ -4909,6 +4895,7 @@ void GaussCol_NEON(const uint16_t* src0,
|
||||
"1: \n"
|
||||
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
|
||||
"ld1 {v2.8h}, [%4], #16 \n"
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
|
||||
@ -4924,7 +4911,6 @@ void GaussCol_NEON(const uint16_t* src0,
|
||||
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
|
||||
"prfm pldl1keep, [%4, 448] \n"
|
||||
"b.gt 1b \n"
|
||||
@ -4950,6 +4936,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
|
||||
"subs %w5, %w5, #8 \n" // 8 processed per loop
|
||||
"add v0.4s, v0.4s, v1.4s \n" // * 1
|
||||
"add v1.4s, v1.4s, v2.4s \n" // * 1
|
||||
"ld1 {v2.4s,v3.4s}, [%2], #32 \n"
|
||||
@ -4962,7 +4949,6 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"mla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"mla v1.4s, v3.4s, v6.4s \n" // * 4
|
||||
"subs %w5, %w5, #8 \n" // 8 processed per loop
|
||||
"uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
|
||||
"uqrshrn2 v0.8h, v1.4s, #8 \n"
|
||||
"st1 {v0.8h}, [%4], #16 \n" // store 8 samples
|
||||
@ -4993,6 +4979,7 @@ void GaussCol_F32_NEON(const float* src0,
|
||||
"1: \n"
|
||||
"ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
|
||||
"ld1 {v2.4s, v3.4s}, [%1], #32 \n"
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
|
||||
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
|
||||
"fmla v1.4s, v3.4s, v6.4s \n"
|
||||
@ -5009,7 +4996,6 @@ void GaussCol_F32_NEON(const float* src0,
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"fadd v1.4s, v1.4s, v5.4s \n"
|
||||
"prfm pldl1keep, [%4, 448] \n"
|
||||
"subs %w6, %w6, #8 \n" // 8 processed per loop
|
||||
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src0), // %0
|
||||
@ -5031,6 +5017,7 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) {
|
||||
"1: \n"
|
||||
"ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
|
||||
// rows
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"fadd v0.4s, v0.4s, v1.4s \n" // * 1
|
||||
"ld1 {v4.4s, v5.4s}, [%0], %5 \n"
|
||||
"fadd v1.4s, v1.4s, v2.4s \n"
|
||||
@ -5045,7 +5032,6 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) {
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
|
||||
"fmul v1.4s, v1.4s, v8.4s \n"
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
"st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
@ -5068,11 +5054,11 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
||||
"1: \n"
|
||||
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
|
||||
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels per loop
|
||||
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels per loop
|
||||
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -5100,12 +5086,12 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels per loop
|
||||
"tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels per loop
|
||||
"st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -5129,6 +5115,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
|
||||
@ -5138,7 +5125,6 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
|
||||
"uqrshrn v2.8b, v1.8h, #2 \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
@ -5158,6 +5144,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
|
||||
@ -5167,7 +5154,6 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
|
||||
"uqrshrn v1.8b, v1.8h, #2 \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
@ -5227,6 +5213,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
||||
"ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
|
||||
"ld1 {v2.16b}, [%1], #16 \n"
|
||||
"ld1 {v3.16b}, [%3], #16 \n"
|
||||
"subs %w5, %w5, #16 \n" // 16 src pixels per loop
|
||||
"uaddlp v0.8h, v0.16b \n" // half size
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n"
|
||||
@ -5237,7 +5224,6 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
||||
"prfm pldl1keep, [%3, 448] \n"
|
||||
"uqrshrn v0.8b, v0.8h, #2 \n"
|
||||
"uqrshrn v1.8b, v1.8h, #2 \n"
|
||||
"subs %w5, %w5, #16 \n" // 16 src pixels per loop
|
||||
"st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
@ -5376,7 +5362,7 @@ void Convert8To8Row_NEON(const uint8_t* src_y,
|
||||
"uzp2 v1.16b, v2.16b, v3.16b \n"
|
||||
"add v0.16b, v0.16b, v5.16b \n" // add bias (16)
|
||||
"add v1.16b, v1.16b, v5.16b \n"
|
||||
"stp q0, q1, [%1], #32 \n" // store 16 pixels
|
||||
"stp q0, q1, [%1], #32 \n" // store 32 pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user