AArch64:add SumSquareError_NEON armv8 assembly version

BUG=none
TESTED=libyuv_unittest
R=fbarchard@google.com

Review URL: https://webrtc-codereview.appspot.com/16259004/

the benckmarking result is as follows:
toolchain: gcc 4.9
hardware: A53

| count | C Times/NEON times |
| 16    | 3.35               |
| 128   | 6.63               |
| 512   | 7.47               |
| 1024  | 7.72               |

Change-Id: Ic10bf22d77d069a1a2074b68bd5a310c579ec490

Review URL: https://webrtc-codereview.appspot.com/21159004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1043 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
yang.zhang@arm.com 2014-08-13 06:10:02 +00:00
parent 1afdfb3da8
commit 26f43db1ef
2 changed files with 40 additions and 1 deletions

View File

@ -80,7 +80,7 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif

View File

@ -56,6 +56,45 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n"
MEMACCESS(1)
"ld1 {v1.16b}, [%1], #16 \n"
"subs %2, %2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"bgt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}
#endif // __ARM_NEON__
#ifdef __cplusplus