From 686e9d024711edfdcc54f069338b15be73edc749 Mon Sep 17 00:00:00 2001 From: "zhongwei.yao@arm.com" Date: Mon, 1 Sep 2014 08:17:36 +0000 Subject: [PATCH] implement ARM64 ScaleARGBRowDownEven and ScaleARGBRowDownEvenBox TESTED=libyuv_unittest BUG=319 R=fbarchard@chromium.org, fbarchard@google.com Review URL: https://webrtc-codereview.appspot.com/19099004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1072 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/scale_row.h | 6 ++-- source/scale_neon64.cc | 66 ++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index c9d821003..70e6bc55b 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -57,8 +57,8 @@ extern "C" { #define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON -/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */ #define HAS_SCALEARGBROWDOWN2_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON #endif // The following are available on Mips platforms: @@ -267,10 +267,10 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx); // Row functions. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width); void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 6c9156230..44df55c6c 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -705,26 +705,24 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( - "mov r12, %3, lsl #2 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], r12 \n" + "ld1 {v0.s}[0], [%0], %3 \n" MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], r12 \n" + "ld1 {v0.s}[1], [%0], %3 \n" MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], r12 \n" + "ld1 {v0.s}[2], [%0], %3 \n" MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], r12 \n" + "ld1 {v0.s}[3], [%0], %3 \n" "subs %2, %2, #4 \n" // 4 pixels per loop. MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" + "st1 {v0.16b}, [%1], #16 \n" "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" + : "r"(src_stepx * 4) // %3 + : "memory", "cc", "v0" ); } #endif //HAS_SCALEARGBROWDOWNEVEN_NEON @@ -732,50 +730,54 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. +// TODO, might be worth another optimization pass in future. +// It could be upgraded to 8 pixels at a time to start with. void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( - "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 MEMACCESS(1) - "vld1.8 {d1}, [%1], r12 \n" + "ld1 {v1.8b}, [%1], %4 \n" MEMACCESS(0) - "vld1.8 {d2}, [%0], r12 \n" + "ld1 {v2.8b}, [%0], %4 \n" MEMACCESS(1) - "vld1.8 {d3}, [%1], r12 \n" + "ld1 {v3.8b}, [%1], %4 \n" MEMACCESS(0) - "vld1.8 {d4}, [%0], r12 \n" + "ld1 {v4.8b}, [%0], %4 \n" MEMACCESS(1) - "vld1.8 {d5}, [%1], r12 \n" + "ld1 {v5.8b}, [%1], %4 \n" MEMACCESS(0) - "vld1.8 {d6}, [%0], r12 \n" + "ld1 {v6.8b}, [%0], %4 \n" MEMACCESS(1) - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "subs %3, %3, #4 \n" // 4 pixels per loop. MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" + "st1 {v0.16b}, [%2], #16 \n" "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + : "r"(src_stepx * 4) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_SCALEARGBROWDOWNEVEN_NEON