From 9e223c3fc025e96ab6781cb3c85ab2a6827853df Mon Sep 17 00:00:00 2001 From: George Steed Date: Tue, 20 Feb 2024 12:56:29 +0000 Subject: [PATCH] [AArch64] Replace instances of ORR with MOV where possible The MOV instruction is an alias of ORR where both source registers are the same, and it should be preferred. Neither ORR nor MOV is a zero-cost instruction on every micro-architecture, so there may be better ways to express these kernels, but this is left for a later commit. Bug: libyuv:975 Change-Id: I29b7f182a57a61855cb7f8a867691080f153b10b Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5332385 Reviewed-by: Frank Barchard --- source/row_neon.cc | 2 +- source/row_neon64.cc | 32 ++++++++++++++++---------------- source/scale_neon64.cc | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/source/row_neon.cc b/source/row_neon.cc index 31142a905..5a1ce50ac 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3822,7 +3822,7 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" - "vorr.u8 q2, q0, q0 \n" // move U after V + "vmov.u8 q2, q0 \n" // move U after V "subs %2, %2, #16 \n" // 16 pixels per loop "vst2.8 {q1, q2}, [%1]! 
\n" // store 16 VU pixels "bgt 1b \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index dd0e7b773..68e0d8da2 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -448,8 +448,8 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { "1: \n" "ld1 {v20.8b}, [%0], #8 \n" "prfm pldl1keep, [%0, 448] \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" + "mov v21.8b, v20.8b \n" + "mov v22.8b, v20.8b \n" "subs %w2, %w2, #8 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" @@ -1561,9 +1561,9 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g + "mov v3.8b, v1.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "orr v4.8b, v0.8b, v0.8b \n" // move r + "mov v4.8b, v0.8b \n" // move r "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1580,9 +1580,9 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g + "mov v2.8b, v4.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "orr v1.8b, v5.8b, v5.8b \n" // move r + "mov v1.8b, v5.8b \n" // move r "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1598,9 +1598,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "orr v3.8b, v1.8b, v1.8b \n" // move g + "mov v3.8b, v1.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "orr v4.8b, v0.8b, v0.8b \n" // move r + "mov v4.8b, v0.8b \n" // move r "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1739,9 +1739,9 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g + "mov v4.8b, v2.8b \n" // mov g "prfm pldl1keep, [%0, 448] \n" - "orr v5.8b, v1.8b, v1.8b \n" // mov b + "mov v5.8b, v1.8b \n" // mov b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1936,7 +1936,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels - "orr v2.8b, v1.8b, v1.8b \n" + "mov v2.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs @@ -1959,7 +1959,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" + "mov v3.8b, v2.8b \n" "prfm pldl1keep, [%0, 448] \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs @@ -3666,8 +3666,8 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R + "mov v1.8b, v0.8b \n" // G + "mov v2.8b, v0.8b \n" // R "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -3879,9 +3879,9 @@ void SobelRow_NEON(const uint8_t* src_sobelx, "subs %w3, %w3, #8 \n" // 8 processed per loop. 
"uqadd v0.8b, v0.8b, v1.8b \n" // add "prfm pldl1keep, [%0, 448] \n" - "orr v1.8b, v0.8b, v0.8b \n" + "mov v1.8b, v0.8b \n" "prfm pldl1keep, [%1, 448] \n" - "orr v2.8b, v0.8b, v0.8b \n" + "mov v2.8b, v0.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_sobelx), // %0 diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 491b6cf7a..12b4b4d09 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -163,7 +163,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "mov v2.16b, v3.16b \n" // order v0,v1,v2 "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n"