From f23d6222ac0d2829dcd26d28456de9991b4bb81d Mon Sep 17 00:00:00 2001 From: "yang.zhang@arm.com" Date: Tue, 31 Mar 2015 03:03:05 +0000 Subject: [PATCH] Add ScaleARGBCols_NEON for ARM32/64 ARM32/64 NEON versions of ScaleARGBCols_NEON are implemented. BUG=319 TESTED=libyuvTest.* on ARM32/64 with Android R=fbarchard@google.com Change-Id: Id9ad97f7aa5d8a34cd55ace9e648cb6ff028efd9 Review URL: https://webrtc-codereview.appspot.com/47689004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1351 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/scale_row.h | 7 +++++++ source/scale_any.cc | 3 +++ source/scale_argb.cc | 24 +++++++++++++++++++++ source/scale_neon.cc | 43 ++++++++++++++++++++++++++++++++++++++ source/scale_neon64.cc | 43 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 120 insertions(+) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 3652c8848..fa1b9e2bd 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -66,6 +66,7 @@ extern "C" { #define HAS_SCALEARGBROWDOWN2_NEON #define HAS_SCALEADDROWS_NEON #define HAS_SCALEFILTERCOLS_NEON +#define HAS_SCALEARGBCOLS_NEON #endif // The following are available on Mips platforms: @@ -354,6 +355,12 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx); +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/source/scale_any.cc b/source/scale_any.cc index 2b3ff4953..9099efaaa 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -33,6 +33,9 @@ extern "C" { #ifdef HAS_SCALEFILTERCOLS_NEON CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) #endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif #undef CANY // Fixed scale down. diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 83713d554..6e019bf9b 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -326,6 +326,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -500,6 +508,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -607,6 +623,14 @@ static void ScaleARGBSimple(int src_width, int src_height, if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBCols = ScaleARGBCols_SSE2; } +#endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBCols = ScaleARGBCols_NEON; + } + } #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 25098fc14..4386910f6 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -916,6 +916,49 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl 2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld1.32 {"#dn"["#n"]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int tmp; + const uint8* src_tmp = src_argb; + asm volatile ( + ".p2align 2 \n" + "1: \n" + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + + MEMACCESS(0) + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1" + ); +} + +#undef LOAD1_DATA32_LANE + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 9f1cac490..aef26b13c 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -912,6 +912,49 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl 2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld1 {"#vn".s}["#n"], [%6] \n" + +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int tmp; + const uint8* src_tmp = src_argb; + asm volatile ( + "1: \n" + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + + MEMACCESS(0) + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1" + ); +} + +#undef LOAD1_DATA32_LANE + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus