mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
row_neon64: fix a warning on iOS where the int width operand doesn't match the size of %2, which is 64-bit by default. Change the operand size to be explicitly 32-bit with %w2.
BUG=437 TESTED=try bots R=bcornell@google.com Review URL: https://webrtc-codereview.appspot.com/47119004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1399 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
6d5554661f
commit
a20e2c6213
@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV444
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV411
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v21, v22, v23)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v20.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v20, v21, v22)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v23, v22, v21)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v20.8b, #255 \n" /* A */
|
||||
MEMACCESS(3)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
|
||||
@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
MEMACCESS(3)
|
||||
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v20, v21, v22)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
MEMACCESS(3)
|
||||
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
ARGBTORGB565
|
||||
MEMACCESS(3)
|
||||
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
|
||||
@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
ARGBTOARGB1555
|
||||
MEMACCESS(3)
|
||||
@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READYUV422
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %4, %4, #8 \n"
|
||||
"subs %w4, %w4, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
ARGBTOARGB4444
|
||||
MEMACCESS(3)
|
||||
@ -517,13 +517,13 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
|
||||
void I400ToARGBRow_NEON(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
int64 width64 = (int64)(width);
|
||||
int64 width64 = (int64)(width);
|
||||
asm volatile (
|
||||
YUV422TORGB_SETUP_REG
|
||||
"1: \n"
|
||||
READYUV400
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
@ -550,7 +550,7 @@ void J400ToARGBRow_NEON(const uint8* src_y,
|
||||
"ld1 {v20.8b}, [%0], #8 \n"
|
||||
"orr v21.8b, v20.8b, v20.8b \n"
|
||||
"orr v22.8b, v20.8b, v20.8b \n"
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -573,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV12
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(2)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
|
||||
@ -600,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV21
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(2)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
|
||||
@ -627,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV12
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
ARGBTORGB565
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
|
||||
@ -654,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
|
||||
"1: \n"
|
||||
READNV21
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
ARGBTORGB565
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
|
||||
@ -675,13 +675,13 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
|
||||
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
int64 width64 = (int64)(width);
|
||||
int64 width64 = (int64)(width);
|
||||
asm volatile (
|
||||
YUV422TORGB_SETUP_REG
|
||||
"1: \n"
|
||||
READYUY2
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
@ -701,13 +701,13 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
|
||||
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
|
||||
uint8* dst_argb,
|
||||
int width) {
|
||||
int64 width64 = (int64)(width);
|
||||
int64 width64 = (int64)(width);
|
||||
asm volatile (
|
||||
YUV422TORGB_SETUP_REG
|
||||
"1: \n"
|
||||
READUYVY
|
||||
YUV422TORGB(v22, v21, v20)
|
||||
"subs %2, %2, #8 \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"movi v23.8b, #255 \n"
|
||||
MEMACCESS(1)
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
|
||||
@ -731,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store U
|
||||
MEMACCESS(2)
|
||||
@ -757,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load U
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load V
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop
|
||||
MEMACCESS(2)
|
||||
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
|
||||
"b.gt 1b \n"
|
||||
@ -779,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
|
||||
"subs %2, %2, #32 \n" // 32 processed per loop
|
||||
"subs %w2, %w2, #32 \n" // 32 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
|
||||
"b.gt 1b \n"
|
||||
@ -797,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) {
|
||||
asm volatile (
|
||||
"dup v0.16b, %w2 \n" // duplicate 16 bytes
|
||||
"1: \n"
|
||||
"subs %1, %1, #16 \n" // 16 bytes per loop
|
||||
"subs %w1, %w1, #16 \n" // 16 bytes per loop
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"b.gt 1b \n"
|
||||
@ -812,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
|
||||
asm volatile (
|
||||
"dup v0.4s, %w2 \n" // duplicate 4 ints
|
||||
"1: \n"
|
||||
"subs %1, %1, #4 \n" // 4 ints per loop
|
||||
"subs %w1, %w1, #4 \n" // 4 ints per loop
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n" // store
|
||||
"b.gt 1b \n"
|
||||
@ -833,7 +833,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
|
||||
"subs %2, %2, #16 \n" // 16 pixels per loop.
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
|
||||
"rev64 v0.16b, v0.16b \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
|
||||
@ -860,7 +860,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
|
||||
"subs %3, %3, #8 \n" // 8 pixels per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
|
||||
"rev64 v0.8b, v0.8b \n"
|
||||
"rev64 v1.8b, v1.8b \n"
|
||||
MEMACCESS(1)
|
||||
@ -888,7 +888,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
|
||||
"subs %2, %2, #4 \n" // 4 pixels per loop.
|
||||
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
|
||||
"rev64 v0.4s, v0.4s \n"
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
|
||||
@ -911,7 +911,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
"b.gt 1b \n"
|
||||
@ -931,7 +931,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
"orr v4.8b, v0.8b, v0.8b \n" // move r
|
||||
MEMACCESS(1)
|
||||
@ -966,7 +966,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
MEMACCESS(1)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
@ -1025,7 +1025,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
MEMACCESS(1)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
@ -1058,7 +1058,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
MEMACCESS(1)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
|
||||
@ -1078,7 +1078,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
|
||||
"b.gt 1b \n"
|
||||
@ -1097,7 +1097,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v4.8b, v2.8b, v2.8b \n" // mov g
|
||||
"orr v5.8b, v1.8b, v1.8b \n" // mov b
|
||||
MEMACCESS(1)
|
||||
@ -1118,7 +1118,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
"b.gt 1b \n"
|
||||
@ -1137,7 +1137,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
|
||||
"b.gt 1b \n"
|
||||
@ -1157,7 +1157,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
|
||||
MEMACCESS(2)
|
||||
@ -1180,7 +1180,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
|
||||
MEMACCESS(2)
|
||||
@ -1204,7 +1204,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
|
||||
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
|
||||
@ -1234,7 +1234,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
|
||||
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
|
||||
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
|
||||
@ -1264,7 +1264,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
|
||||
"b.gt 1b \n"
|
||||
@ -1287,7 +1287,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
|
||||
"subs %2, %2, #4 \n" // 4 processed per loop
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
|
||||
MEMACCESS(1)
|
||||
"st1 {v1.16b}, [%1], #16 \n" // store 4.
|
||||
@ -1315,7 +1315,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
|
||||
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
|
||||
MEMACCESS(2)
|
||||
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
|
||||
"subs %4, %4, #16 \n" // 16 pixels
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
MEMACCESS(3)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
|
||||
"b.gt 1b \n"
|
||||
@ -1344,7 +1344,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
|
||||
MEMACCESS(2)
|
||||
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
|
||||
"subs %4, %4, #16 \n" // 16 pixels
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
MEMACCESS(3)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
|
||||
"b.gt 1b \n"
|
||||
@ -1365,7 +1365,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTORGB565
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
|
||||
@ -1411,7 +1411,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB1555
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
|
||||
@ -1433,7 +1433,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGBTOARGB4444
|
||||
MEMACCESS(1)
|
||||
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
|
||||
@ -1457,7 +1457,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
||||
@ -1484,7 +1484,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
||||
@ -1515,7 +1515,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||
"umlsl v4.8h, v1.8b, v25.8b \n" // G
|
||||
"umlsl v4.8h, v2.8b, v26.8b \n" // R
|
||||
@ -1559,7 +1559,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"mul v3.8h, v0.8h, v20.8h \n" // B
|
||||
"mls v3.8h, v1.8h, v21.8h \n" // G
|
||||
"mls v3.8h, v2.8h, v22.8h \n" // R
|
||||
@ -1615,7 +1615,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %3, %3, #32 \n" // 32 processed per loop.
|
||||
"subs %w3, %w3, #32 \n" // 32 processed per loop.
|
||||
"mul v3.8h, v0.8h, v20.8h \n" // B
|
||||
"mls v3.8h, v1.8h, v21.8h \n" // G
|
||||
"mls v3.8h, v2.8h, v22.8h \n" // R
|
||||
@ -1681,7 +1681,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1728,7 +1728,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1769,7 +1769,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
|
||||
"urshr v1.8h, v3.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1810,7 +1810,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v2.8h, v1.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1851,7 +1851,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1892,7 +1892,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v2.8h, v2.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1933,7 +1933,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
|
||||
"urshr v1.8h, v1.8h, #1 \n"
|
||||
"urshr v0.8h, v0.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 32 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 32 processed per loop.
|
||||
RGBTOUV(v2.8h, v1.8h, v0.8h)
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
@ -1999,7 +1999,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
|
||||
"urshr v5.8h, v18.8h, #1 \n"
|
||||
"urshr v6.8h, v20.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"mul v16.8h, v4.8h, v22.8h \n" // B
|
||||
"mls v16.8h, v5.8h, v23.8h \n" // G
|
||||
"mls v16.8h, v6.8h, v24.8h \n" // R
|
||||
@ -2070,7 +2070,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
|
||||
"urshr v5.8h, v17.8h, #1 \n"
|
||||
"urshr v6.8h, v18.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"mul v2.8h, v4.8h, v20.8h \n" // B
|
||||
"mls v2.8h, v5.8h, v21.8h \n" // G
|
||||
"mls v2.8h, v6.8h, v22.8h \n" // R
|
||||
@ -2141,7 +2141,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
|
||||
"urshr v5.8h, v17.8h, #1 \n"
|
||||
"urshr v6.8h, v18.8h, #1 \n"
|
||||
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"mul v2.8h, v4.8h, v20.8h \n" // B
|
||||
"mls v2.8h, v5.8h, v21.8h \n" // G
|
||||
"mls v2.8h, v6.8h, v22.8h \n" // R
|
||||
@ -2181,7 +2181,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
RGB565TOARGB
|
||||
"umull v3.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v25.8b \n" // G
|
||||
@ -2211,7 +2211,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB1555TOARGB
|
||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
||||
@ -2240,7 +2240,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
ARGB4444TOARGB
|
||||
"umull v3.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v3.8h, v1.8b, v25.8b \n" // G
|
||||
@ -2269,7 +2269,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v1.8b, v4.8b \n" // R
|
||||
"umlal v16.8h, v2.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v3.8b, v6.8b \n" // B
|
||||
@ -2297,7 +2297,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // R
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // B
|
||||
@ -2325,7 +2325,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v1.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v2.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v3.8b, v6.8b \n" // R
|
||||
@ -2353,7 +2353,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
||||
@ -2381,7 +2381,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
||||
@ -2425,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"umull v2.8h, v0.8b, v4.8b \n"
|
||||
"umull2 v3.8h, v0.16b, v4.16b \n"
|
||||
"umlal v2.8h, v1.8b, v5.8b \n"
|
||||
@ -2443,7 +2443,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
@ -2457,7 +2457,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
@ -2470,7 +2470,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"ld1 {v0.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
@ -2482,7 +2482,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"100: \n"
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"subs %w3, %w3, #16 \n"
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"b.gt 100b \n"
|
||||
@ -2505,7 +2505,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"subs %3, %3, #8 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"b.lt 89f \n"
|
||||
// Blend 8 pixels.
|
||||
"8: \n"
|
||||
@ -2513,7 +2513,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v16.8h, v4.8b, v3.8b \n" // db * a
|
||||
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
|
||||
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
|
||||
@ -2541,7 +2541,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
|
||||
"subs %3, %3, #1 \n" // 1 processed per loop.
|
||||
"subs %w3, %w3, #1 \n" // 1 processed per loop.
|
||||
"umull v16.8h, v4.8b, v3.8b \n" // db * a
|
||||
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
|
||||
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
|
||||
@ -2580,7 +2580,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v3.8b \n" // b * a
|
||||
"umull v5.8h, v1.8b, v3.8b \n" // g * a
|
||||
"umull v6.8h, v2.8b, v3.8b \n" // r * a
|
||||
@ -2614,7 +2614,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"subs %w1, %w1, #8 \n" // 8 processed per loop.
|
||||
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
|
||||
"uxtl v1.8h, v1.8b \n"
|
||||
"uxtl v2.8h, v2.8b \n"
|
||||
@ -2658,7 +2658,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
|
||||
"uxtl v5.8h, v5.8b \n"
|
||||
"uxtl v6.8h, v6.8b \n"
|
||||
@ -2695,7 +2695,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v4.8h, v1.8b, v25.8b \n" // G
|
||||
"umlal v4.8h, v2.8b, v26.8b \n" // R
|
||||
@ -2734,7 +2734,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"subs %w1, %w1, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
|
||||
"umlal v4.8h, v1.8b, v21.8b \n" // G
|
||||
"umlal v4.8h, v2.8b, v22.8b \n" // R
|
||||
@ -2774,7 +2774,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
|
||||
"uxtl v17.8h, v17.8b \n" // g
|
||||
"uxtl v18.8h, v18.8b \n" // r
|
||||
@ -2836,7 +2836,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
|
||||
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
|
||||
"umull v2.8h, v2.8b, v6.8b \n" // multiply R
|
||||
@ -2870,7 +2870,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v0.8b, v0.8b, v4.8b \n"
|
||||
"uqadd v1.8b, v1.8b, v5.8b \n"
|
||||
"uqadd v2.8b, v2.8b, v6.8b \n"
|
||||
@ -2900,7 +2900,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
MEMACCESS(1)
|
||||
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqsub v0.8b, v0.8b, v4.8b \n"
|
||||
"uqsub v1.8b, v1.8b, v5.8b \n"
|
||||
"uqsub v2.8b, v2.8b, v6.8b \n"
|
||||
@ -2935,7 +2935,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v0.8b, v0.8b, v1.8b \n" // add
|
||||
"orr v1.8b, v0.8b, v0.8b \n"
|
||||
"orr v2.8b, v0.8b, v0.8b \n"
|
||||
@ -2963,7 +2963,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
|
||||
MEMACCESS(1)
|
||||
"ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop.
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"uqadd v0.16b, v0.16b, v1.16b \n" // add
|
||||
MEMACCESS(2)
|
||||
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
|
||||
@ -2994,7 +2994,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
|
||||
MEMACCESS(1)
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v1.8b, v0.8b, v2.8b \n" // add
|
||||
MEMACCESS(2)
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
|
||||
@ -3034,7 +3034,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
"ld1 {v2.8b}, [%2],%5 \n" // bottom
|
||||
MEMACCESS(2)
|
||||
"ld1 {v3.8b}, [%2],%6 \n"
|
||||
"subs %4, %4, #8 \n" // 8 pixels
|
||||
"subs %w4, %w4, #8 \n" // 8 pixels
|
||||
"usubl v1.8h, v2.8b, v3.8b \n"
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
"abs v0.8h, v0.8h \n"
|
||||
@ -3079,7 +3079,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
|
||||
"ld1 {v2.8b}, [%0],%5 \n" // right
|
||||
MEMACCESS(1)
|
||||
"ld1 {v3.8b}, [%1],%5 \n"
|
||||
"subs %3, %3, #8 \n" // 8 pixels
|
||||
"subs %w3, %w3, #8 \n" // 8 pixels
|
||||
"usubl v1.8h, v2.8b, v3.8b \n"
|
||||
"add v0.8h, v0.8h, v1.8h \n"
|
||||
"abs v0.8h, v0.8h \n"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user