From de9fa43c60840e3c3b0538f72bb67c6406b4acd7 Mon Sep 17 00:00:00 2001 From: "ashok.bhat@gmail.com" Date: Wed, 13 Aug 2014 08:33:17 +0000 Subject: [PATCH] Row AArch64 Neon implementation - Part 1 BUG=319 TEST=libyuv_unittest R=fbarchard@google.com Change-Id: I367ffa7bb0fd0337ab8486d3eb4fb94afea7400c Signed-off-by: Ashok Bhat Review URL: https://webrtc-codereview.appspot.com/21149004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1044 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 88 +++++++++++++++++++++++++++++++++ include/libyuv/version.h | 2 +- source/row_neon64.cc | 102 +++++++++++++++++++-------------------- 4 files changed, 141 insertions(+), 53 deletions(-) diff --git a/README.chromium b/README.chromium index 49e6a5c55..79a53e503 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1041 +Version: 1044 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9a79fda20..2895407e8 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -252,6 +252,94 @@ extern "C" { // The following are available on arm64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +// #define HAS_I444TOARGBROW_NEON +// #define HAS_I422TOARGBROW_NEON +// #define HAS_I411TOARGBROW_NEON +// #define HAS_I422TOBGRAROW_NEON +// #define HAS_I422TOABGRROW_NEON +// #define HAS_I422TORGBAROW_NEON +// #define HAS_I422TORGB24ROW_NEON +// #define HAS_I422TORAWROW_NEON +// #define HAS_I422TORGB565ROW_NEON +// #define HAS_I422TOARGB1555ROW_NEON +// #define HAS_I422TOARGB4444ROW_NEON +// #define HAS_YTOARGBROW_NEON +// #define HAS_I400TOARGBROW_NEON +// #define HAS_NV12TOARGBROW_NEON +// #define HAS_NV21TOARGBROW_NEON +// #define HAS_NV12TORGB565ROW_NEON +// #define HAS_NV21TORGB565ROW_NEON +// #define HAS_YUY2TOARGBROW_NEON +// #define HAS_UYVYTOARGBROW_NEON +// #define HAS_SPLITUVROW_NEON +// #define HAS_MERGEUVROW_NEON +// #define HAS_COPYROW_NEON +// #define HAS_SETROW_NEON +// #define HAS_ARGBSETROWS_NEON +// #define HAS_MIRRORROW_NEON +// #define HAS_MIRRORUVROW_NEON +// #define HAS_ARGBMIRRORROW_NEON +// #define HAS_RGB24TOARGBROW_NEON +// #define HAS_RAWTOARGBROW_NEON +// #define HAS_RGB565TOARGBROW_NEON +// #define HAS_ARGB1555TOARGBROW_NEON +// #define HAS_ARGB4444TOARGBROW_NEON +// #define HAS_ARGBTORGB24ROW_NEON +// #define HAS_ARGBTORAWROW_NEON +// #define HAS_YUY2TOYROW_NEON +// #define HAS_UYVYTOYROW_NEON +// #define HAS_YUY2TOUV422ROW_NEON +// #define HAS_UYVYTOUV422ROW_NEON +// #define HAS_YUY2TOUVROW_NEON +// #define HAS_UYVYTOUVROW_NEON +// #define HAS_HALFROW_NEON +// #define HAS_ARGBTOBAYERROW_NEON +// #define HAS_ARGBTOBAYERGGROW_NEON +// #define HAS_ARGBSHUFFLEROW_NEON +// #define HAS_I422TOYUY2ROW_NEON +// #define HAS_I422TOUYVYROW_NEON +// #define HAS_ARGBTORGB565ROW_NEON +// #define HAS_ARGBTOARGB1555ROW_NEON +// #define HAS_ARGBTOARGB4444ROW_NEON +// #define HAS_ARGBTOYROW_NEON +// #define HAS_ARGBTOYJROW_NEON +// #define HAS_ARGBTOUV444ROW_NEON +// #define HAS_ARGBTOUV422ROW_NEON +// #define HAS_ARGBTOUV411ROW_NEON +// #define HAS_ARGBTOUVROW_NEON +// #define HAS_ARGBTOUVJROW_NEON +// #define HAS_BGRATOUVROW_NEON +// #define HAS_ABGRTOUVROW_NEON +// #define HAS_RGBATOUVROW_NEON +// #define HAS_RGB24TOUVROW_NEON +// #define HAS_RAWTOUVROW_NEON +// #define HAS_RGB565TOUVROW_NEON +// #define HAS_ARGB1555TOUVROW_NEON +// #define HAS_ARGB4444TOUVROW_NEON +// #define HAS_RGB565TOYROW_NEON +// #define HAS_ARGB1555TOYROW_NEON +// #define HAS_ARGB4444TOYROW_NEON +// #define HAS_BGRATOYROW_NEON +// #define HAS_ABGRTOYROW_NEON +// #define HAS_RGBATOYROW_NEON +// #define HAS_RGB24TOYROW_NEON +// #define HAS_RAWTOYROW_NEON +// #define HAS_INTERPOLATEROW_NEON +// #define HAS_ARGBBLENDROW_NEON +// #define HAS_ARGBATTENUATEROW_NEON +// #define HAS_ARGBQUANTIZEROW_NEON +// #define HAS_ARGBSHADEROW_NEON +// #define HAS_ARGBGRAYROW_NEON +// #define HAS_ARGBSEPIAROW_NEON +// #define HAS_ARGBCOLORMATRIXROW_NEON +// #define HAS_ARGBMULTIPLYROW_NEON +// #define HAS_ARGBADDROW_NEON +// #define HAS_ARGBSUBTRACTROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELYROW_NEON #endif // The following are available on Neon platforms: diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 912c4c9e0..73d831889 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1041 +#define LIBYUV_VERSION 1044 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 46e9ceb33..b7238fb75 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3141,27 +3141,27 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d3, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "mov v1.8b, v0.8b \n" + "mov v2.8b, v0.8b \n" MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_SOBELROW_NEON @@ -3175,20 +3175,20 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add + "uqadd v0.16b, v0.16b, v1.16b \n" // add MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1" ); } #endif // HAS_SOBELTOPLANEROW_NEON @@ -3202,25 +3202,25 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d3, #255 \n" // alpha + "movi v3.8b, #255 \n" // alpha // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add + "uqadd v1.8b, v0.8b, v2.8b \n" // add MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1" + : "cc", "memory", "v0", "v1", "v2", "v3" ); } #endif // HAS_SOBELXYROW_NEON @@ -3236,28 +3236,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0],%5 \n" // top + "ld1 {v0.8b}, [%0],%5 \n" // top MEMACCESS(0) - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" MEMACCESS(1) - "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 MEMACCESS(1) - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" MEMACCESS(2) - "vld1.8 {d2}, [%2],%5 \n" // bottom + "ld1 {v2.8b}, [%2],%5 \n" // bottom MEMACCESS(2) - "vld1.8 {d3}, [%2],%6 \n" + "ld1 {v3.8b}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" MEMACCESS(3) - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -3266,7 +3266,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %4 : "r"(2), // %5 "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_SOBELXROW_NEON @@ -3282,28 +3282,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld1.8 {d0}, [%0],%4 \n" // left + "ld1 {v0.8b}, [%0],%4 \n" // left MEMACCESS(1) - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" MEMACCESS(0) - "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 MEMACCESS(1) - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" MEMACCESS(0) - "vld1.8 {d2}, [%0],%5 \n" // right + "ld1 {v2.8b}, [%0],%5 \n" // right MEMACCESS(1) - "vld1.8 {d3}, [%1],%5 \n" + "ld1 {v3.8b}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -3311,7 +3311,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %3 : "r"(1), // %4 "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } #endif // HAS_SOBELYROW_NEON