diff --git a/source/row_neon64.cc b/source/row_neon64.cc index e06f65e56..7f25a34c5 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3810,8 +3810,8 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, asm volatile( // 8 pixel loop. "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v4.8b \n" "prfm pldl1keep, [%0, 448] \n" @@ -3819,7 +3819,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, "prfm pldl1keep, [%1, 448] \n" "uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1