mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
[AArch64] Replace instances of ORR with MOV where possible
The MOV instruction is an alias of ORR in which both source registers are the same, and it should be preferred for register-to-register copies. Neither ORR nor MOV is a zero-cost instruction on every micro-architecture, so there may be better ways to express these kernels, but that is left for a later commit. Bug: libyuv:975 Change-Id: I29b7f182a57a61855cb7f8a867691080f153b10b Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5332385 Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
9d660a0f3b
commit
9e223c3fc0
@ -3822,7 +3822,7 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
||||
"vld2.8 {d1, d3}, [%0]! \n"
|
||||
"vorr.u8 q2, q0, q0 \n" // move U after V
|
||||
"vmov.u8 q2, q0 \n" // move U after V
|
||||
"subs %2, %2, #16 \n" // 16 pixels per loop
|
||||
"vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
|
||||
"bgt 1b \n"
|
||||
|
||||
@ -448,8 +448,8 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
"1: \n"
|
||||
"ld1 {v20.8b}, [%0], #8 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"orr v21.8b, v20.8b, v20.8b \n"
|
||||
"orr v22.8b, v20.8b, v20.8b \n"
|
||||
"mov v21.8b, v20.8b \n"
|
||||
"mov v22.8b, v20.8b \n"
|
||||
"subs %w2, %w2, #8 \n"
|
||||
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
@ -1561,9 +1561,9 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
||||
"1: \n"
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
"mov v3.8b, v1.8b \n" // move g
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"orr v4.8b, v0.8b, v0.8b \n" // move r
|
||||
"mov v4.8b, v0.8b \n" // move r
|
||||
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
@ -1580,9 +1580,9 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
|
||||
"1: \n"
|
||||
"ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v2.8b, v4.8b, v4.8b \n" // move g
|
||||
"mov v2.8b, v4.8b \n" // move g
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"orr v1.8b, v5.8b, v5.8b \n" // move r
|
||||
"mov v1.8b, v5.8b \n" // move r
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
@ -1598,9 +1598,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
|
||||
"1: \n"
|
||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v3.8b, v1.8b, v1.8b \n" // move g
|
||||
"mov v3.8b, v1.8b \n" // move g
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"orr v4.8b, v0.8b, v0.8b \n" // move r
|
||||
"mov v4.8b, v0.8b \n" // move r
|
||||
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
@ -1739,9 +1739,9 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
|
||||
"1: \n"
|
||||
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"orr v4.8b, v2.8b, v2.8b \n" // mov g
|
||||
"mov v4.8b, v2.8b \n" // mov g
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"orr v5.8b, v1.8b, v1.8b \n" // mov b
|
||||
"mov v5.8b, v1.8b \n" // mov b
|
||||
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
@ -1936,7 +1936,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
|
||||
"1: \n"
|
||||
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
|
||||
"subs %w4, %w4, #16 \n" // 16 pixels
|
||||
"orr v2.8b, v1.8b, v1.8b \n"
|
||||
"mov v2.8b, v1.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
|
||||
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
|
||||
@ -1959,7 +1959,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
|
||||
"orr v3.8b, v2.8b, v2.8b \n"
|
||||
"mov v3.8b, v2.8b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
|
||||
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
|
||||
@ -3666,8 +3666,8 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||
"umlal v4.8h, v1.8b, v25.8b \n" // G
|
||||
"umlal v4.8h, v2.8b, v26.8b \n" // R
|
||||
"uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
|
||||
"orr v1.8b, v0.8b, v0.8b \n" // G
|
||||
"orr v2.8b, v0.8b, v0.8b \n" // R
|
||||
"mov v1.8b, v0.8b \n" // G
|
||||
"mov v2.8b, v0.8b \n" // R
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
@ -3879,9 +3879,9 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"uqadd v0.8b, v0.8b, v1.8b \n" // add
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"orr v1.8b, v0.8b, v0.8b \n"
|
||||
"mov v1.8b, v0.8b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"orr v2.8b, v0.8b, v0.8b \n"
|
||||
"mov v2.8b, v0.8b \n"
|
||||
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_sobelx), // %0
|
||||
|
||||
@ -163,7 +163,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||
"subs %w2, %w2, #24 \n"
|
||||
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
|
||||
"mov v2.16b, v3.16b \n" // order v0,v1,v2
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user