From f7d9b9fb13a0c50f94b25d49226435671f9ec1c5 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 15 Sep 2014 23:39:43 +0000 Subject: [PATCH] change vector range notation to a list of registers for clang compatibility. break compare into 2 neon files for consistency with other neon64 files. BUG=357 TESTED=local ios build R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/30379004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1085 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- libyuv.gyp | 1 + source/compare_neon.cc | 39 ------------ source/compare_neon64.cc | 63 +++++++++++++++++++ source/row_neon64.cc | 130 +++++++++++++++++++-------------------- source/scale_neon64.cc | 42 ++++++------- 7 files changed, 152 insertions(+), 127 deletions(-) create mode 100644 source/compare_neon64.cc diff --git a/README.chromium b/README.chromium index da3161fb2..cb31b649d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1083 +Version: 1084 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e16b0dedf..5ed87e8df 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1083 +#define LIBYUV_VERSION 1084 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/libyuv.gyp b/libyuv.gyp index 0f3cad462..52e7554fb 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -73,6 +73,7 @@ 'sources': [ # sources. 
'source/compare_neon.cc', + 'source/compare_neon64.cc', 'source/rotate_neon.cc', 'source/rotate_neon64.cc', 'source/row_neon.cc', diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 0f62c6cb1..ef006ec41 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -57,45 +57,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { return sse; } -#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) - -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" - - ".p2align 2 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - "subs %2, %2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" - - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); - return sse; -} - #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc new file mode 100644 index 000000000..cc078f84c --- /dev/null +++ b/source/compare_neon64.cc @@ -0,0 +1,63 @@ +/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b}, [%0], #16 \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + "subs %2, %2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + return sse; +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_neon64.cc b/source/row_neon64.cc index b6ab0ee52..6fe60b8c4 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -825,7 +825,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store U @@ -855,7 +855,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %3, 
%3, #16 \n" // 16 processed per loop MEMACCESS(2) - "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" : "+r"(src_u), // %0 @@ -875,10 +875,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32 + "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop MEMACCESS(1) - "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32 + "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -1010,10 +1010,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 @@ -1031,12 +1031,12 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %2, %2, #8 \n" // 8 processed per loop. "mov v3.8b, v1.8b \n" // move g "mov v4.8b, v0.8b \n" // move r MEMACCESS(1) - "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 @@ -1170,10 +1170,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB. 
"subs %2, %2, #8 \n" // 8 processed per loop. MEMACCESS(1) - "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 @@ -1190,12 +1190,12 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %2, %2, #8 \n" // 8 processed per loop. "mov v4.8b, v2.8b \n" // mov g "mov v5.8b, v1.8b \n" // mov b MEMACCESS(1) - "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 @@ -1212,7 +1212,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. @@ -1232,7 +1232,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. @@ -1253,7 +1253,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "st1 {v1.8b}, [%1], #8 \n" // store 8 U. 
@@ -1277,7 +1277,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "st1 {v0.8b}, [%1], #8 \n" // store 8 U. @@ -1302,10 +1302,10 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row YUY2. "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V MEMACCESS(2) @@ -1332,10 +1332,10 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row UYVY. "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V MEMACCESS(2) @@ -1388,7 +1388,7 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, "mov v2.s[0], %w3 \n" // selector "1: \n" MEMACCESS(0) - "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels. + "ld1 {v0.16b,v1.16b}, [%0], 32 \n" // load row 8 pixels. 
"subs %2, %2, #8 \n" // 8 processed per loop "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels @@ -1412,7 +1412,7 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop MEMACCESS(1) "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. @@ -1467,7 +1467,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -1489,7 +1489,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "mov v3.8b, v2.8b \n" MEMACCESS(1) "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us @@ -1497,7 +1497,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels MEMACCESS(3) - "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -1586,7 +1586,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. 
"umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G @@ -1614,7 +1614,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G @@ -1646,7 +1646,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B "umlsl v4.8h, v1.8b, v25.8b \n" // G @@ -1691,7 +1691,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. @@ -1741,12 +1741,12 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.16b-v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. MEMACCESS(0) - "ld4 {v4.16b-v7.16b}, [%0], #64 \n" // load next 16 ARGB pixels. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16 ARGB pixels. "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. 
@@ -2474,7 +2474,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R "umlal v16.8h, v2.8b, v5.8b \n" // G @@ -2503,7 +2503,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // R "umlal v16.8h, v1.8b, v5.8b \n" // G @@ -2532,7 +2532,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B "umlal v16.8h, v2.8b, v5.8b \n" // G @@ -2561,7 +2561,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G @@ -2590,7 +2590,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW. + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. 
"umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G @@ -2720,9 +2720,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // Blend 8 pixels. "8: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a @@ -2738,7 +2738,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "uqadd v2.8b, v2.8b, v6.8b \n" // + sr "movi v3.8b, #255 \n" // a = 255 MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB. "b.ge 8b \n" "89: \n" @@ -2748,9 +2748,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixels. "1: \n" MEMACCESS(0) - "ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. MEMACCESS(1) - "ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a @@ -2766,7 +2766,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "uqadd v2.8b, v2.8b, v6.8b \n" // + sr "movi v3.8b, #255 \n" // a = 255 MEMACCESS(2) - "st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel. + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. "b.ge 1b \n" "99: \n" @@ -2789,7 +2789,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // Attenuate 8 pixels. 
"1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v5.8h, v1.8b, v3.8b \n" // g * a @@ -2798,7 +2798,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 MEMACCESS(1) - "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -2824,7 +2824,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v1.8h, v1.8b \n" @@ -2842,7 +2842,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "uqxtn v1.8b, v1.8h \n" "uqxtn v2.8b, v2.8h \n" MEMACCESS(0) - "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB. "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -2869,7 +2869,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v5.8h, v5.8b \n" @@ -2884,7 +2884,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "uqxtn v6.8b, v6.8h \n" "uqxtn v7.8b, v7.8h \n" MEMACCESS(1) - "st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB. 
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -2907,7 +2907,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B "umlal v4.8h, v1.8b, v25.8b \n" // G @@ -2916,7 +2916,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "mov v1.8b, v0.8b \n" // G "mov v2.8b, v0.8b \n" // R MEMACCESS(1) - "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -2947,7 +2947,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umlal v4.8h, v1.8b, v21.8b \n" // G @@ -2962,7 +2962,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R MEMACCESS(0) - "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels. "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -2988,7 +2988,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit "uxtl v17.8h, v17.8b \n" // g @@ -3027,7 +3027,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A MEMACCESS(1) - "st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels. + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB pixels. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3049,9 +3049,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v1.8h, v1.8b, v5.8b \n" // multiply G @@ -3062,7 +3062,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "b.gt 1b \n" : "+r"(src_argb0), // %0 @@ -3084,16 +3084,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. 
"uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "b.gt 1b \n" : "+r"(src_argb0), // %0 @@ -3115,16 +3115,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) - "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n" MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "b.gt 1b \n" : "+r"(src_argb0), // %0 @@ -3159,7 +3159,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "mov v1.8b, v0.8b \n" "mov v2.8b, v0.8b \n" MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -3218,7 +3218,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "subs %3, %3, #8 \n" // 8 processed per loop. "uqadd v1.8b, v0.8b, v2.8b \n" // add MEMACCESS(2) - "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. 
"b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index f4cab9762..90bc74817 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -28,7 +28,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "1: \n" // load even pixels into v0, odd into v1 MEMACCESS(0) - "ld2 {v0.16b, v1.16b}, [%0], #32 \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" "subs %2, %2, #16 \n" // 16 processed per loop MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store odd pixels @@ -51,7 +51,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %1, %0 \n" "1: \n" MEMACCESS(0) - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc MEMACCESS(1) "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop @@ -80,7 +80,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop MEMACCESS(1) "st1 {v2.8b}, [%1], #8 \n" @@ -142,11 +142,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, asm volatile ( "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %2, %2, #24 \n" "mov v2.8b, v3.8b \n" // order v0, v1, v2 MEMACCESS(1) - "st3 {v0.8b-v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -166,9 +166,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "add %3, %3, %0 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 MEMACCESS(3) - "ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1 + "ld4 
{v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "subs %2, %2, #24 \n" // filter src line 0 with src line 1 @@ -205,7 +205,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "uqrshrn v2.8b, v16.8h, #2 \n" MEMACCESS(1) - "st3 {v0.8b-v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 @@ -228,9 +228,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "add %3, %3, %0 \n" "1: \n" MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 MEMACCESS(3) - "ld4 {v4.8b-v7.8b}, [%3], #32 \n" // src line 1 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "subs %2, %2, #24 \n" // average src line 0 with src line 1 "urhadd v0.8b, v0.8b, v4.8b \n" @@ -252,7 +252,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "uqrshrn v2.8b, v4.8h, #2 \n" MEMACCESS(1) - "st3 {v0.8b-v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -285,9 +285,9 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, "ld1 {v3.16b}, [%3] \n" "1: \n" MEMACCESS(0) - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" "subs %2, %2, #12 \n" - "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" MEMACCESS(1) "st1 {v2.8b}, [%1], #8 \n" MEMACCESS(1) @@ -325,11 +325,11 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" MEMACCESS(3) - "ld4 {v4.8b-v7.8b}, [%3], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" MEMACCESS(4) - "ld4 {v16.8b-v19.8b}, [%4], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data @@ -451,9 +451,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, // 20 60 21 61 22 62 23 63 
// 30 70 31 71 32 72 33 73 MEMACCESS(0) - "ld4 {v0.8b-v3.8b}, [%0], #32 \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" MEMACCESS(3) - "ld4 {v4.8b-v7.8b}, [%3], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data @@ -673,14 +673,14 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add %1, %1, %0 \n" "1: \n" MEMACCESS (0) - "ld4 {v0.16b - v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. MEMACCESS (1) - "ld4 {v16.16b - v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. @@ -690,7 +690,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "rshrn v2.8b, v2.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n" MEMACCESS (2) - "st4 {v0.8b - v3.8b}, [%2], #32 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r" (src_ptr), // %0 "+r" (src_stride), // %1