From 2df5743bd4d3e128f486d859502607a3c9d86240 Mon Sep 17 00:00:00 2001 From: "ashok.bhat@gmail.com" Date: Fri, 29 Aug 2014 08:12:51 +0000 Subject: [PATCH] Row AArch64 Neon implementation - Part 6 BUG=319 TESTED=libyuv_unittest R=fbarchard@google.com Change-Id: I5d93eb184ba873d5e7637a3b5a830be39a967c6f Signed-off-by: Ashok Bhat Review URL: https://webrtc-codereview.appspot.com/15239004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1069 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 26 +- include/libyuv/version.h | 2 +- source/convert.cc | 62 ++--- source/row_any.cc | 36 +++ source/row_neon64.cc | 525 ++++++++++++++++++++------------------- 6 files changed, 351 insertions(+), 302 deletions(-) diff --git a/README.chromium b/README.chromium index ce303a717..b57517f79 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1067 +Version: 1069 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 44e63a904..477b27447 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -319,19 +319,19 @@ extern "C" { // #define HAS_RGB565TOYROW_NEON // #define HAS_ARGB1555TOYROW_NEON // #define HAS_ARGB4444TOYROW_NEON -// #define HAS_BGRATOYROW_NEON -// #define HAS_ABGRTOYROW_NEON -// #define HAS_RGBATOYROW_NEON -// #define HAS_RGB24TOYROW_NEON -// #define HAS_RAWTOYROW_NEON -// #define HAS_INTERPOLATEROW_NEON -// #define HAS_ARGBBLENDROW_NEON -// #define HAS_ARGBATTENUATEROW_NEON -// #define HAS_ARGBQUANTIZEROW_NEON -// #define HAS_ARGBSHADEROW_NEON -// #define HAS_ARGBGRAYROW_NEON -// #define HAS_ARGBSEPIAROW_NEON -// #define HAS_ARGBCOLORMATRIXROW_NEON +#define HAS_BGRATOYROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_RGBATOYROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RAWTOYROW_NEON +#define HAS_INTERPOLATEROW_NEON +#define HAS_ARGBBLENDROW_NEON +#define HAS_ARGBATTENUATEROW_NEON +#define HAS_ARGBQUANTIZEROW_NEON +#define HAS_ARGBSHADEROW_NEON +#define HAS_ARGBGRAYROW_NEON +#define HAS_ARGBSEPIAROW_NEON +#define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBADDROW_NEON #define HAS_ARGBSUBTRACTROW_NEON diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b176ec145..4ddd276fb 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1067 +#define LIBYUV_VERSION 1069 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index a8e294f47..c31ecf263 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -782,13 +782,15 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, if (IS_ALIGNED(width, 8)) { BGRAToYRow = BGRAToYRow_NEON; } - if (width >= 16) { + } +#endif +#if defined(HAS_BGRATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { BGRAToUVRow = BGRAToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { BGRAToUVRow = BGRAToUVRow_NEON; } } - } #endif for (y = 0; y < height - 1; y += 2) { @@ -851,11 +853,13 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_NEON; } - if (width >= 16) { - ABGRToUVRow = ABGRToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ABGRToUVRow = ABGRToUVRow_NEON; - } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if 
(IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; } } #endif @@ -920,11 +924,13 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, if (IS_ALIGNED(width, 8)) { RGBAToYRow = RGBAToYRow_NEON; } - if (width >= 16) { - RGBAToUVRow = RGBAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGBAToUVRow = RGBAToUVRow_NEON; - } + } +#endif +#if defined(HAS_RGBATOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + RGBAToUVRow = RGBAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_NEON; } } #endif @@ -983,15 +989,16 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, if (IS_ALIGNED(width, 8)) { RGB24ToYRow = RGB24ToYRow_NEON; } - if (width >= 16) { - RGB24ToUVRow = RGB24ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_NEON; - } + } +#endif +#if defined(HAS_RGB24TOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; } } -#else // HAS_RGB24TOYROW_NEON - +#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -1019,7 +1026,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } #endif // HAS_ARGBTOUVROW_SSSE3 -#endif // HAS_RGB24TOYROW_NEON { #if !defined(HAS_RGB24TOYROW_NEON) @@ -1100,15 +1106,16 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, if (IS_ALIGNED(width, 8)) { RAWToYRow = RAWToYRow_NEON; } - if (width >= 16) { - RAWToUVRow = RAWToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_NEON; - } + } +#endif +#if defined(HAS_RAWTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 16) { + RAWToUVRow = RAWToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; } } -#else // HAS_RAWTOYROW_NEON - +#endif #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -1136,7 +1143,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } #endif // HAS_ARGBTOUVROW_SSSE3 -#endif // HAS_RAWTOYROW_NEON { // Allocate 2 rows of ARGB. 
diff --git a/source/row_any.cc b/source/row_any.cc index ce8b3dad1..aaa0378d7 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -245,14 +245,32 @@ YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16) #endif #ifdef HAS_ARGBTOYROW_NEON YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_ARGBTOYJROW_NEON YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_BGRATOYROW_NEON YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_ABGRTOYROW_NEON YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_RGBATOYROW_NEON YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8) +#endif +#ifdef HAS_RGB24TOYROW_NEON YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8) +#endif +#ifdef HAS_RAWTOYROW_NEON YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8) +#endif +#ifdef HAS_RGB565TOYROW_NEON YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_ARGB1555TOYROW_NEON YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8) +#endif +#ifdef HAS_ARGB4444TOYROW_NEON YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8) #endif #ifdef HAS_YUY2TOYROW_NEON @@ -342,14 +360,32 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) #endif #ifdef HAS_ARGBTOUVROW_NEON UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) +#endif +#ifdef HAS_ARGBTOUVJROW_NEON UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) +#endif +#ifdef HAS_BGRATOUVROW_NEON UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) +#endif +#ifdef HAS_ABGRTOUVROW_NEON UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGBATOUVROW_NEON UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) +#endif +#ifdef HAS_RGB24TOUVROW_NEON UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) +#endif +#ifdef HAS_RAWTOUVROW_NEON UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) +#endif +#ifdef HAS_RGB565TOUVROW_NEON UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVROW_NEON UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVROW_NEON UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) #endif #ifdef HAS_YUY2TOUVROW_NEON diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 55282f0db..952e10d73 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1,5 +1,5 @@ /* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * Copyright 2014 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -2466,28 +2466,28 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { #ifdef HAS_BGRATOYROW_NEON void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. 
+ "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_BGRATOYROW_NEON @@ -2495,28 +2495,28 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { #ifdef HAS_ABGRTOYROW_NEON void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 + "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_ABGRTOYROW_NEON @@ -2524,28 +2524,28 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { #ifdef HAS_RGBATOYROW_NEON void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. 
- "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 + "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_RGBATOYROW_NEON @@ -2553,28 +2553,28 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { #ifdef HAS_RGB24TOYROW_NEON void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(dst_y), // %1 + "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_RGB24TOYROW_NEON @@ -2582,28 +2582,28 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { #ifdef HAS_RAWTOYROW_NEON void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" MEMACCESS(1) - "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels Y. + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(pix) // %2 + "+r"(dst_y), // %1 + "+r"(pix) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } #endif // HAS_RAWTOYROW_NEON @@ -2613,10 +2613,12 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; asm volatile ( "cmp %4, #0 \n" "beq 100f \n" - "add %2, %1 \n" "cmp %4, #64 \n" "beq 75f \n" "cmp %4, #128 \n" @@ -2624,85 +2626,85 @@ void InterpolateRow_NEON(uint8* dst_ptr, "cmp %4, #192 \n" "beq 25f \n" - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" // General purpose row blend. "1: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" + "st1 {v0.16b}, [%0], #16 \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" + "st1 {v0.16b}, [%0], #16 \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" + "ld1 {v1.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" + "st1 {v0.16b}, [%0], #16 \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" + "ld1 {v1.16b}, [%1], #16 \n" MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" + "ld1 {v0.16b}, [%2], #16 \n" "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" + "st1 {v0.16b}, [%0], #16 \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" + "ld1 {v0.16b}, [%1], #16 \n" "subs %3, %3, #16 \n" MEMACCESS(0) - "vst1.8 {q0}, [%0]! 
\n" + "st1 {v0.16b}, [%0], #16 \n" "bgt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 + "+r"(src_ptr1), // %2 "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" + : "cc", "memory", "v0", "v1", "v3", "v4", "v5" ); } #endif // HAS_INTERPOLATEROW_NEON @@ -2712,54 +2714,58 @@ void InterpolateRow_NEON(uint8* dst_ptr, void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "subs %3, #8 \n" + "subs %3, %3, #8 \n" "blt 89f \n" // Blend 8 pixels. "8: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0. MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB. "bge 8b \n" "89: \n" - "adds %3, #8-1 \n" + "adds %3, %3, #8-1 \n" "blt 99f \n" // Blend 1 pixels. "1: \n" MEMACCESS(0) - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. MEMACCESS(1) - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. 
- "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 MEMACCESS(2) - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel. "bge 1b \n" "99: \n" @@ -2769,7 +2775,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "+r"(dst_argb), // %2 "+r"(width) // %3 : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18" ); } #endif // HAS_ARGBBLENDROW_NEON @@ -2781,22 +2788,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // Attenuate 8 pixels. "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } #endif // HAS_ARGBATTENUATEROW_NEON @@ -2807,41 +2814,41 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile ( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 
255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" MEMACCESS(0) - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 "r"(interval_size), // %3 "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } #endif // HAS_ARGBQUANTIZEROW_NEON @@ -2853,36 +2860,36 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { asm volatile ( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" MEMACCESS(1) - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB. 
"bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13" + : "cc", "memory", "v0", "v4", "v5", "v6", "v7" ); } #endif // HAS_ARGBSHADEROW_NEON @@ -2893,28 +2900,28 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBGRAYROW_NEON void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "mov v1.8b, v0.8b \n" // G + "mov v2.8b, v0.8b \n" // R MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" ); } #endif // HAS_ARGBGRAYROW_NEON @@ -2927,40 +2934,40 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBSEPIAROW_NEON void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. 
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R MEMACCESS(0) - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" ); } #endif // HAS_ARGBSEPIAROW_NEON @@ -2973,60 +2980,60 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. ".p2align 2 \n" "1: \n" MEMACCESS(0) - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 
255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q15, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A 
MEMACCESS(1) - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", + "v18", "v19", "v22", "v23", "v24", "v25" ); } #endif // HAS_ARGBCOLORMATRIXROW_NEON
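
For reference, a minimal scalar C sketch of the fixed-point luma computation that the BGRAToYRow/ABGRToYRow/RGBAToYRow/RGB24ToYRow/RAWToYRow kernels in this patch implement; the kernels differ only in which loaded byte lane receives which coefficient, according to each format's byte order. The function name below is illustrative, not libyuv API.

#include <stdint.h>

/* Scalar model of one pixel of the *ToYRow_NEON kernels. Coefficients
 * 33/65/13 are Q7 fixed point (0.2578/0.5078/0.1016 of R/G/B). */
static uint8_t RGBToY_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  uint32_t y = 33u * r + 65u * g + 13u * b; /* umull/umlal: widen and accumulate */
  y = (y + 64u) >> 7;                       /* sqrshrun #7: rounding narrow to 8 bit */
  y += 16u;                                 /* uqadd: add the 16 luma offset */
  return (uint8_t)(y > 255u ? 255u : y);    /* saturate, as the NEON ops do */
}

Each NEON kernel applies this math to 8 pixels per loop iteration, which is why the *_Any_NEON wrappers in row_any.cc require the width masks seen above.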