Row AArch64 Neon implementation - Part 6
BUG=319
TESTED=libyuv_unittest
R=fbarchard@google.com

Change-Id: I5d93eb184ba873d5e7637a3b5a830be39a967c6f
Signed-off-by: Ashok Bhat <ashok.bhat@arm.com>
Review URL: https://webrtc-codereview.appspot.com/15239004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1069 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent 4d5c3f3498
commit 2df5743bd4
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1067
+Version: 1069
 License: BSD
 License File: LICENSE
@@ -319,19 +319,19 @@ extern "C" {
// #define HAS_RGB565TOYROW_NEON
// #define HAS_ARGB1555TOYROW_NEON
// #define HAS_ARGB4444TOYROW_NEON
// #define HAS_BGRATOYROW_NEON
// #define HAS_ABGRTOYROW_NEON
// #define HAS_RGBATOYROW_NEON
// #define HAS_RGB24TOYROW_NEON
// #define HAS_RAWTOYROW_NEON
// #define HAS_INTERPOLATEROW_NEON
// #define HAS_ARGBBLENDROW_NEON
// #define HAS_ARGBATTENUATEROW_NEON
// #define HAS_ARGBQUANTIZEROW_NEON
// #define HAS_ARGBSHADEROW_NEON
// #define HAS_ARGBGRAYROW_NEON
// #define HAS_ARGBSEPIAROW_NEON
// #define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_ABGRTOYROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_INTERPOLATEROW_NEON
#define HAS_ARGBBLENDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#define HAS_ARGBQUANTIZEROW_NEON
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBGRAYROW_NEON
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON
#define HAS_ARGBMULTIPLYROW_NEON
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
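The block above flips the AArch64 row kernels from stubbed out to enabled. Each HAS_*_NEON define gates both the kernel prototype in row.h and the run-time selection in source/convert.cc; a minimal sketch of that selection pattern, using names from this diff (the width check and the surrounding function are simplified assumptions):

  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) = BGRAToYRow_C;
#if defined(HAS_BGRATOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
    BGRAToYRow = BGRAToYRow_Any_NEON;  // handles any width
    if (IS_ALIGNED(width, 8)) {
      BGRAToYRow = BGRAToYRow_NEON;    // fast path: width is a multiple of 8
    }
  }
#endif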
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1067
+#define LIBYUV_VERSION 1069
 
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
@@ -782,13 +782,15 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_NEON;
}
if (width >= 16) {
}
#endif
#if defined(HAS_BGRATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
BGRAToUVRow = BGRAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_NEON;
}
}
}
#endif

for (y = 0; y < height - 1; y += 2) {
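The loop header kept as context at the end of this hunk advances two rows per iteration because I420 chroma is subsampled 2x2: one BGRAToUVRow call reads a pair of source rows while BGRAToYRow runs once per row. A simplified sketch of the loop body, which is not shown in this hunk, so treat it as illustrative rather than the exact code:

  for (y = 0; y < height - 1; y += 2) {
    BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
    BGRAToYRow(src_bgra, dst_y, width);
    BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
    src_bgra += src_stride_bgra * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }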
@@ -851,11 +853,13 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_NEON;
}
if (width >= 16) {
ABGRToUVRow = ABGRToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_NEON;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
ABGRToUVRow = ABGRToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_NEON;
}
}
#endif
@@ -920,11 +924,13 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_NEON;
}
if (width >= 16) {
RGBAToUVRow = RGBAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_NEON;
}
}
#endif
#if defined(HAS_RGBATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RGBAToUVRow = RGBAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_NEON;
}
}
#endif
@@ -983,15 +989,16 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON;
}
if (width >= 16) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
#endif
#if defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
#else // HAS_RGB24TOYROW_NEON

#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@@ -1019,7 +1026,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RGB24TOYROW_NEON

{
#if !defined(HAS_RGB24TOYROW_NEON)
@@ -1100,15 +1106,16 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON;
}
if (width >= 16) {
RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
#endif
#if defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
RAWToUVRow = RAWToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToUVRow = RAWToUVRow_NEON;
}
}
#else // HAS_RAWTOYROW_NEON

#endif
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@@ -1136,7 +1143,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
#endif // HAS_RAWTOYROW_NEON

{
// Allocate 2 rows of ARGB.
@@ -245,14 +245,32 @@ YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
#endif
#ifdef HAS_ARGBTOYROW_NEON
YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_ARGBTOYJROW_NEON
YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_BGRATOYROW_NEON
YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_ABGRTOYROW_NEON
YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_RGBATOYROW_NEON
YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
#endif
#ifdef HAS_RGB24TOYROW_NEON
YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
#endif
#ifdef HAS_RAWTOYROW_NEON
YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
#endif
#ifdef HAS_RGB565TOYROW_NEON
YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_ARGB1555TOYROW_NEON
YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_ARGB4444TOYROW_NEON
YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
#endif
#ifdef HAS_YUY2TOYROW_NEON
@@ -342,14 +360,32 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_NEON
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_NEON
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
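The YANY and UVANY macros above generate the *_Any_NEON wrappers referenced from convert.cc: the NEON kernel handles the bulk of the row and the remaining pixels (width not a multiple of 8, or of 16 for UV) are handled separately. The exact macro bodies are not part of this diff, so the expansion below is only a sketch of the idea; the real macro may instead re-run the SIMD kernel on an overlapping final block:

  // Illustrative expansion of YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8),
  // where 4 is source bytes per pixel, 1 is destination bytes per pixel,
  // and 8 is the kernel's pixel granularity.
  void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
    int n = width & ~7;                  // largest multiple of 8
    if (n > 0) {
      BGRAToYRow_NEON(src_bgra, dst_y, n);
    }
    BGRAToYRow_C(src_bgra + n * 4, dst_y + n * 1, width & 7);  // leftover pixels
  }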
@@ -1,5 +1,5 @@
 /*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
@@ -2466,28 +2466,28 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
#ifdef HAS_BGRATOYROW_NEON
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R
"vmlal.u8 q8, d2, d5 \n" // G
"vmlal.u8 q8, d3, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
"umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_BGRATOYROW_NEON
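The BGRA-to-Y kernel above uses Q7 fixed-point BT.601-style coefficients (33/128 ≈ 0.2578 for R, 65/128 ≈ 0.5078 for G, 13/128 ≈ 0.1016 for B), a rounding shift by 7, and a saturating add of the 16 luma offset. Scalar equivalent for one pixel (illustrative helper name):

  static uint8 BGRAPixelToY(uint8 r, uint8 g, uint8 b) {
    int y = (33 * r + 65 * g + 13 * b + 64) >> 7;  // sqrshrun #7: round and narrow
    y += 16;                                       // uqadd with the #16 constant
    return (uint8)(y > 255 ? 255 : y);             // saturate, kept for fidelity
  }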
@ -2495,28 +2495,28 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
|
||||
#ifdef HAS_ABGRTOYROW_NEON
|
||||
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
|
||||
asm volatile (
|
||||
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
|
||||
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ABGR.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d0, d4 \n" // R
|
||||
"vmlal.u8 q8, d1, d5 \n" // G
|
||||
"vmlal.u8 q8, d2, d6 \n" // B
|
||||
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
|
||||
"vqadd.u8 d0, d7 \n"
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // R
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // B
|
||||
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
:
|
||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ABGRTOYROW_NEON
|
||||
@ -2524,28 +2524,28 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
|
||||
#ifdef HAS_RGBATOYROW_NEON
|
||||
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
|
||||
asm volatile (
|
||||
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
|
||||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
|
||||
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of RGBA.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d1, d4 \n" // B
|
||||
"vmlal.u8 q8, d2, d5 \n" // G
|
||||
"vmlal.u8 q8, d3, d6 \n" // R
|
||||
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
|
||||
"vqadd.u8 d0, d7 \n"
|
||||
"umull v16.8h, v1.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v2.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v3.8b, v6.8b \n" // R
|
||||
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
:
|
||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
|
||||
);
|
||||
}
|
||||
#endif // HAS_RGBATOYROW_NEON
|
||||
@ -2553,28 +2553,28 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
|
||||
#ifdef HAS_RGB24TOYROW_NEON
|
||||
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
|
||||
asm volatile (
|
||||
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
|
||||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
|
||||
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d0, d4 \n" // B
|
||||
"vmlal.u8 q8, d1, d5 \n" // G
|
||||
"vmlal.u8 q8, d2, d6 \n" // R
|
||||
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
|
||||
"vqadd.u8 d0, d7 \n"
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
||||
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_rgb24), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
:
|
||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
|
||||
);
|
||||
}
|
||||
#endif // HAS_RGB24TOYROW_NEON
|
||||
@ -2582,28 +2582,28 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
|
||||
#ifdef HAS_RAWTOYROW_NEON
|
||||
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
|
||||
asm volatile (
|
||||
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
|
||||
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
||||
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
|
||||
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
|
||||
"movi v6.8b, #13 \n" // B * 0.1016 coefficient
|
||||
"movi v7.8b, #16 \n" // Add 16 constant
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
|
||||
"ld3 {v0.8b-v2.8b}, [%0], #24 \n" // load 8 pixels of RAW.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q8, d0, d4 \n" // B
|
||||
"vmlal.u8 q8, d1, d5 \n" // G
|
||||
"vmlal.u8 q8, d2, d6 \n" // R
|
||||
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
|
||||
"vqadd.u8 d0, d7 \n"
|
||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
||||
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
|
||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
||||
MEMACCESS(1)
|
||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
:
|
||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
|
||||
);
|
||||
}
|
||||
#endif // HAS_RAWTOYROW_NEON
|
||||
@ -2613,10 +2613,12 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
|
||||
void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
int dst_width, int source_y_fraction) {
|
||||
int y1_fraction = source_y_fraction;
|
||||
int y0_fraction = 256 - y1_fraction;
|
||||
const uint8* src_ptr1 = src_ptr + src_stride;
|
||||
asm volatile (
|
||||
"cmp %4, #0 \n"
|
||||
"beq 100f \n"
|
||||
"add %2, %1 \n"
|
||||
"cmp %4, #64 \n"
|
||||
"beq 75f \n"
|
||||
"cmp %4, #128 \n"
|
||||
@ -2624,85 +2626,85 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
"cmp %4, #192 \n"
|
||||
"beq 25f \n"
|
||||
|
||||
"vdup.8 d5, %4 \n"
|
||||
"rsb %4, #256 \n"
|
||||
"vdup.8 d4, %4 \n"
|
||||
"dup v5.16b, %w4 \n"
|
||||
"dup v4.16b, %w5 \n"
|
||||
// General purpose row blend.
|
||||
"1: \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"vmull.u8 q13, d0, d4 \n"
|
||||
"vmull.u8 q14, d1, d4 \n"
|
||||
"vmlal.u8 q13, d2, d5 \n"
|
||||
"vmlal.u8 q14, d3, d5 \n"
|
||||
"vrshrn.u16 d0, q13, #8 \n"
|
||||
"vrshrn.u16 d1, q14, #8 \n"
|
||||
"umull v2.8h, v0.8b, v4.8b \n"
|
||||
"umull2 v3.8h, v0.16b, v4.16b \n"
|
||||
"umlal v2.8h, v1.8b, v5.8b \n"
|
||||
"umlal2 v3.8h, v1.16b, v5.16b \n"
|
||||
"rshrn v0.8b, v2.8h, #8 \n"
|
||||
"rshrn2 v0.16b, v3.8h, #8 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {q0}, [%0]! \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"bgt 1b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 25 / 75.
|
||||
"25: \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"vrhadd.u8 q0, q1 \n"
|
||||
"vrhadd.u8 q0, q1 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {q0}, [%0]! \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"bgt 25b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 50 / 50.
|
||||
"50: \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"vld1.8 {q1}, [%2]! \n"
|
||||
"ld1 {v1.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"vrhadd.u8 q0, q1 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {q0}, [%0]! \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"bgt 50b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 75 / 25.
|
||||
"75: \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {q1}, [%1]! \n"
|
||||
"ld1 {v1.16b}, [%1], #16 \n"
|
||||
MEMACCESS(2)
|
||||
"vld1.8 {q0}, [%2]! \n"
|
||||
"ld1 {v0.16b}, [%2], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
"vrhadd.u8 q0, q1 \n"
|
||||
"vrhadd.u8 q0, q1 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {q0}, [%0]! \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"bgt 75b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
"100: \n"
|
||||
MEMACCESS(1)
|
||||
"vld1.8 {q0}, [%1]! \n"
|
||||
"ld1 {v0.16b}, [%1], #16 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
MEMACCESS(0)
|
||||
"vst1.8 {q0}, [%0]! \n"
|
||||
"st1 {v0.16b}, [%0], #16 \n"
|
||||
"bgt 100b \n"
|
||||
|
||||
"99: \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(src_stride), // %2
|
||||
"+r"(src_ptr1), // %2
|
||||
"+r"(dst_width), // %3
|
||||
"+r"(source_y_fraction) // %4
|
||||
"+r"(y1_fraction), // %4
|
||||
"+r"(y0_fraction) // %5
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
|
||||
: "cc", "memory", "v0", "v1", "v3", "v4", "v5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_INTERPOLATEROW_NEON
|
||||
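InterpolateRow_NEON blends a row with the row one stride below using an 8.8 fixed-point fraction: source_y_fraction values of 0, 64, 128 and 192 take dedicated copy or average branches, and any other value falls through to the multiply-accumulate path above. Scalar equivalent of that general path for one byte (illustrative helper):

  static uint8 InterpolateByte(uint8 row0, uint8 row1, int y1_fraction) {
    int y0_fraction = 256 - y1_fraction;  // as in the new AArch64 prologue
    return (uint8)((row0 * y0_fraction + row1 * y1_fraction + 128) >> 8);  // rshrn #8 rounds
  }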
@ -2712,54 +2714,58 @@ void InterpolateRow_NEON(uint8* dst_ptr,
|
||||
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"subs %3, #8 \n"
|
||||
"subs %3, %3, #8 \n"
|
||||
"blt 89f \n"
|
||||
// Blend 8 pixels.
|
||||
"8: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
|
||||
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB0.
|
||||
MEMACCESS(1)
|
||||
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
|
||||
"ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 pixels of ARGB1.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q10, d4, d3 \n" // db * a
|
||||
"vmull.u8 q11, d5, d3 \n" // dg * a
|
||||
"vmull.u8 q12, d6, d3 \n" // dr * a
|
||||
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
|
||||
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
|
||||
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
|
||||
"vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
|
||||
"vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
|
||||
"vqadd.u8 q0, q0, q2 \n" // + sbg
|
||||
"vqadd.u8 d2, d2, d6 \n" // + sr
|
||||
"vmov.u8 d3, #255 \n" // a = 255
|
||||
"umull v16.8h, v4.8b, v3.8b \n" // db * a
|
||||
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
|
||||
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
|
||||
"uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
|
||||
"uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
|
||||
"uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
|
||||
"uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
|
||||
"uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
|
||||
"uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
|
||||
"uqadd v0.8b, v0.8b, v4.8b \n" // + sb
|
||||
"uqadd v1.8b, v1.8b, v5.8b \n" // + sg
|
||||
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
|
||||
"movi v3.8b, #255 \n" // a = 255
|
||||
MEMACCESS(2)
|
||||
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
|
||||
"st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 pixels of ARGB.
|
||||
"bge 8b \n"
|
||||
|
||||
"89: \n"
|
||||
"adds %3, #8-1 \n"
|
||||
"adds %3, %3, #8-1 \n"
|
||||
"blt 99f \n"
|
||||
|
||||
// Blend 1 pixels.
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
|
||||
"ld4 {v0.b-v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
|
||||
MEMACCESS(1)
|
||||
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
|
||||
"ld4 {v4.b-v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
|
||||
"subs %3, %3, #1 \n" // 1 processed per loop.
|
||||
"vmull.u8 q10, d4, d3 \n" // db * a
|
||||
"vmull.u8 q11, d5, d3 \n" // dg * a
|
||||
"vmull.u8 q12, d6, d3 \n" // dr * a
|
||||
"vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
|
||||
"vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
|
||||
"vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
|
||||
"vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
|
||||
"vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
|
||||
"vqadd.u8 q0, q0, q2 \n" // + sbg
|
||||
"vqadd.u8 d2, d2, d6 \n" // + sr
|
||||
"vmov.u8 d3, #255 \n" // a = 255
|
||||
"umull v16.8h, v4.8b, v3.8b \n" // db * a
|
||||
"umull v17.8h, v5.8b, v3.8b \n" // dg * a
|
||||
"umull v18.8h, v6.8b, v3.8b \n" // dr * a
|
||||
"uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
|
||||
"uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
|
||||
"uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
|
||||
"uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
|
||||
"uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
|
||||
"uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
|
||||
"uqadd v0.8b, v0.8b, v4.8b \n" // + sb
|
||||
"uqadd v1.8b, v1.8b, v5.8b \n" // + sg
|
||||
"uqadd v2.8b, v2.8b, v6.8b \n" // + sr
|
||||
"movi v3.8b, #255 \n" // a = 255
|
||||
MEMACCESS(2)
|
||||
"vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
|
||||
"st4 {v0.b-v3.b}[0], [%2], #4 \n" // store 1 pixel.
|
||||
"bge 1b \n"
|
||||
|
||||
"99: \n"
|
||||
@ -2769,7 +2775,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"+r"(dst_argb), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v18"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBBLENDROW_NEON
|
||||
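ARGBBlendRow_NEON composites source over destination using the source alpha: each destination channel is reduced by dst * a / 256 (rounded, saturating at zero), the source channel is added with saturation, and the output alpha is forced to 255. One-channel scalar sketch (illustrative helper):

  static uint8 BlendChannel(uint8 src, uint8 src_a, uint8 dst) {
    int d = dst - ((dst * src_a + 128) >> 8);  // uqrshrn #8 then uqsub
    if (d < 0) d = 0;
    int out = src + d;                         // uqadd
    return (uint8)(out > 255 ? 255 : out);
  }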
@ -2781,22 +2788,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
// Attenuate 8 pixels.
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q10, d0, d3 \n" // b * a
|
||||
"vmull.u8 q11, d1, d3 \n" // g * a
|
||||
"vmull.u8 q12, d2, d3 \n" // r * a
|
||||
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
|
||||
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
|
||||
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
|
||||
"umull v4.8h, v0.8b, v3.8b \n" // b * a
|
||||
"umull v5.8h, v1.8b, v3.8b \n" // g * a
|
||||
"umull v6.8h, v2.8b, v3.8b \n" // r * a
|
||||
"uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
|
||||
"uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
|
||||
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
|
||||
MEMACCESS(1)
|
||||
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
|
||||
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q10", "q11", "q12"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBATTENUATEROW_NEON
|
||||
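ARGBAttenuateRow_NEON premultiplies B, G and R by the pixel's alpha with a rounded shift by 8, leaving alpha itself untouched; scalar equivalent (illustrative helper):

  static uint8 Attenuate(uint8 c, uint8 a) {
    return (uint8)((c * a + 128) >> 8);  // uqrshrn #8: rounded, saturating narrow
  }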
@ -2807,41 +2814,41 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
|
||||
int interval_offset, int width) {
|
||||
asm volatile (
|
||||
"vdup.u16 q8, %2 \n"
|
||||
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
|
||||
"vdup.u16 q9, %3 \n" // interval multiply.
|
||||
"vdup.u16 q10, %4 \n" // interval add
|
||||
"dup v4.8h, %w2 \n"
|
||||
"ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
|
||||
"dup v5.8h, %w3 \n" // interval multiply.
|
||||
"dup v6.8h, %w4 \n" // interval add
|
||||
|
||||
// 8 pixel loop.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
|
||||
"ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 pixels of ARGB.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
|
||||
"vmovl.u8 q1, d2 \n"
|
||||
"vmovl.u8 q2, d4 \n"
|
||||
"vqdmulh.s16 q0, q0, q8 \n" // b * scale
|
||||
"vqdmulh.s16 q1, q1, q8 \n" // g
|
||||
"vqdmulh.s16 q2, q2, q8 \n" // r
|
||||
"vmul.u16 q0, q0, q9 \n" // b * interval_size
|
||||
"vmul.u16 q1, q1, q9 \n" // g
|
||||
"vmul.u16 q2, q2, q9 \n" // r
|
||||
"vadd.u16 q0, q0, q10 \n" // b + interval_offset
|
||||
"vadd.u16 q1, q1, q10 \n" // g
|
||||
"vadd.u16 q2, q2, q10 \n" // r
|
||||
"vqmovn.u16 d0, q0 \n"
|
||||
"vqmovn.u16 d2, q1 \n"
|
||||
"vqmovn.u16 d4, q2 \n"
|
||||
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
|
||||
"uxtl v1.8h, v1.8b \n"
|
||||
"uxtl v2.8h, v2.8b \n"
|
||||
"sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
|
||||
"sqdmulh v1.8h, v1.8h, v4.8h \n" // g
|
||||
"sqdmulh v2.8h, v2.8h, v4.8h \n" // r
|
||||
"mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
|
||||
"mul v1.8h, v1.8h, v5.8h \n" // g
|
||||
"mul v2.8h, v2.8h, v5.8h \n" // r
|
||||
"add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
|
||||
"add v1.8h, v1.8h, v6.8h \n" // g
|
||||
"add v2.8h, v2.8h, v6.8h \n" // r
|
||||
"uqxtn v0.8b, v0.8h \n"
|
||||
"uqxtn v1.8b, v1.8h \n"
|
||||
"uqxtn v2.8b, v2.8h \n"
|
||||
MEMACCESS(0)
|
||||
"vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
|
||||
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 pixels of ARGB.
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(width) // %1
|
||||
: "r"(scale), // %2
|
||||
"r"(interval_size), // %3
|
||||
"r"(interval_offset) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBQUANTIZEROW_NEON
|
||||
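In ARGBQuantizeRow_NEON the scale operand is halved up front so the saturating doubling multiply (vqdmulh/sqdmulh) computes (v * scale) >> 16; B, G and R are then snapped to interval_size steps plus interval_offset, while A passes through. Scalar sketch; that scale is typically 65536 / interval_size is an assumption, not shown in this hunk:

  static uint8 Quantize(uint8 v, int scale, int interval_size, int interval_offset) {
    int q = (v * scale) >> 16;              // what sqdmulh on (scale >> 1) computes
    int out = q * interval_size + interval_offset;
    return (uint8)(out > 255 ? 255 : out);  // uqxtn saturates on store
  }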
@ -2853,36 +2860,36 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
|
||||
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
asm volatile (
|
||||
"vdup.u32 q0, %3 \n" // duplicate scale value.
|
||||
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
|
||||
"vshr.u16 q0, q0, #1 \n" // scale / 2.
|
||||
"dup v0.4s, %w3 \n" // duplicate scale value.
|
||||
"zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
|
||||
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
|
||||
|
||||
// 8 pixel loop.
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"ld4 {v4.8b-v7.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
|
||||
"vmovl.u8 q11, d22 \n"
|
||||
"vmovl.u8 q12, d24 \n"
|
||||
"vmovl.u8 q13, d26 \n"
|
||||
"vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
|
||||
"vqrdmulh.s16 q11, q11, d0[1] \n" // g
|
||||
"vqrdmulh.s16 q12, q12, d0[2] \n" // r
|
||||
"vqrdmulh.s16 q13, q13, d0[3] \n" // a
|
||||
"vqmovn.u16 d20, q10 \n"
|
||||
"vqmovn.u16 d22, q11 \n"
|
||||
"vqmovn.u16 d24, q12 \n"
|
||||
"vqmovn.u16 d26, q13 \n"
|
||||
"uxtl v4.8h, v4.8b \n" // b (0 .. 255)
|
||||
"uxtl v5.8h, v5.8b \n"
|
||||
"uxtl v6.8h, v6.8b \n"
|
||||
"uxtl v7.8h, v7.8b \n"
|
||||
"sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
|
||||
"sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
|
||||
"sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
|
||||
"sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
|
||||
"uqxtn v4.8b, v4.8h \n"
|
||||
"uqxtn v5.8b, v5.8h \n"
|
||||
"uqxtn v6.8b, v6.8h \n"
|
||||
"uqxtn v7.8b, v7.8h \n"
|
||||
MEMACCESS(1)
|
||||
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
|
||||
"st4 {v4.8b-v7.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(value) // %3
|
||||
: "cc", "memory", "q0", "q10", "q11", "q12", "q13"
|
||||
: "cc", "memory", "v0", "v4", "v5", "v6", "v7"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBSHADEROW_NEON
|
||||
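ARGBShadeRow_NEON scales every channel by the matching channel of the packed ARGB 'value' operand, using a rounding doubling multiply on value/2, so the effect is roughly c * shade / 255. A rough scalar approximation, not bit-exact with the NEON rounding (illustrative helper):

  static uint8 Shade(uint8 c, uint8 shade) {
    return (uint8)((c * shade + 127) / 255);  // approximately what sqrdmulh achieves
  }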
@ -2893,28 +2900,28 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
#ifdef HAS_ARGBGRAYROW_NEON
|
||||
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
|
||||
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
|
||||
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
|
||||
"movi v24.8b, #15 \n" // B * 0.11400 coefficient
|
||||
"movi v25.8b, #75 \n" // G * 0.58700 coefficient
|
||||
"movi v26.8b, #38 \n" // R * 0.29900 coefficient
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d24 \n" // B
|
||||
"vmlal.u8 q2, d1, d25 \n" // G
|
||||
"vmlal.u8 q2, d2, d26 \n" // R
|
||||
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
|
||||
"vmov d1, d0 \n" // G
|
||||
"vmov d2, d0 \n" // R
|
||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||
"umlal v4.8h, v1.8b, v25.8b \n" // G
|
||||
"umlal v4.8h, v2.8b, v26.8b \n" // R
|
||||
"sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
|
||||
"mov v1.8b, v0.8b \n" // G
|
||||
"mov v2.8b, v0.8b \n" // R
|
||||
MEMACCESS(1)
|
||||
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
|
||||
"st4 {v0.8b-v3.8b}, [%1], #32 \n" // store 8 ARGB pixels.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q12", "q13"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBGRAYROW_NEON
|
||||
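ARGBGrayRow_NEON computes a luma from coefficients 15, 75 and 38 (summing to 128, i.e. Q7) and writes it to B, G and R while preserving A; scalar equivalent (illustrative helper):

  static uint8 Gray(uint8 b, uint8 g, uint8 r) {
    return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);  // sqrshrun #7 rounds and narrows
  }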
@ -2927,40 +2934,40 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
#ifdef HAS_ARGBSEPIAROW_NEON
|
||||
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"vmov.u8 d20, #17 \n" // BB coefficient
|
||||
"vmov.u8 d21, #68 \n" // BG coefficient
|
||||
"vmov.u8 d22, #35 \n" // BR coefficient
|
||||
"vmov.u8 d24, #22 \n" // GB coefficient
|
||||
"vmov.u8 d25, #88 \n" // GG coefficient
|
||||
"vmov.u8 d26, #45 \n" // GR coefficient
|
||||
"vmov.u8 d28, #24 \n" // BB coefficient
|
||||
"vmov.u8 d29, #98 \n" // BG coefficient
|
||||
"vmov.u8 d30, #50 \n" // BR coefficient
|
||||
"movi v20.8b, #17 \n" // BB coefficient
|
||||
"movi v21.8b, #68 \n" // BG coefficient
|
||||
"movi v22.8b, #35 \n" // BR coefficient
|
||||
"movi v24.8b, #22 \n" // GB coefficient
|
||||
"movi v25.8b, #88 \n" // GG coefficient
|
||||
"movi v26.8b, #45 \n" // GR coefficient
|
||||
"movi v28.8b, #24 \n" // BB coefficient
|
||||
"movi v29.8b, #98 \n" // BG coefficient
|
||||
"movi v30.8b, #50 \n" // BR coefficient
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
|
||||
"ld4 {v0.8b-v3.8b}, [%0] \n" // load 8 ARGB pixels.
|
||||
"subs %1, %1, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
|
||||
"vmlal.u8 q2, d1, d21 \n" // G
|
||||
"vmlal.u8 q2, d2, d22 \n" // R
|
||||
"vmull.u8 q3, d0, d24 \n" // B to Sepia G
|
||||
"vmlal.u8 q3, d1, d25 \n" // G
|
||||
"vmlal.u8 q3, d2, d26 \n" // R
|
||||
"vmull.u8 q8, d0, d28 \n" // B to Sepia R
|
||||
"vmlal.u8 q8, d1, d29 \n" // G
|
||||
"vmlal.u8 q8, d2, d30 \n" // R
|
||||
"vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
|
||||
"vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
|
||||
"vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
|
||||
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
|
||||
"umlal v4.8h, v1.8b, v21.8b \n" // G
|
||||
"umlal v4.8h, v2.8b, v22.8b \n" // R
|
||||
"umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
|
||||
"umlal v5.8h, v1.8b, v25.8b \n" // G
|
||||
"umlal v5.8h, v2.8b, v26.8b \n" // R
|
||||
"umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
|
||||
"umlal v6.8h, v1.8b, v29.8b \n" // G
|
||||
"umlal v6.8h, v2.8b, v30.8b \n" // R
|
||||
"uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
|
||||
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
|
||||
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
|
||||
MEMACCESS(0)
|
||||
"vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
|
||||
"st4 {v0.8b-v3.8b}, [%0], #32 \n" // store 8 ARGB pixels.
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(width) // %1
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3",
|
||||
"q10", "q11", "q12", "q13", "q14", "q15"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBSEPIAROW_NEON
|
||||
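ARGBSepiaRow_NEON applies a fixed 3x3 Q7 matrix in place; the saturating #7 narrow matters because the green and red rows' coefficients sum to more than 128, so bright pixels can exceed 255 before clamping. Scalar equivalent (illustrative helper, alpha preserved):

  static void Sepia(uint8* b, uint8* g, uint8* r) {
    int sb = (17 * *b + 68 * *g + 35 * *r) >> 7;
    int sg = (22 * *b + 88 * *g + 45 * *r) >> 7;
    int sr = (24 * *b + 98 * *g + 50 * *r) >> 7;
    *b = (uint8)(sb > 255 ? 255 : sb);
    *g = (uint8)(sg > 255 ? 255 : sg);
    *r = (uint8)(sr > 255 ? 255 : sr);
  }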
@ -2973,60 +2980,60 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
|
||||
const int8* matrix_argb, int width) {
|
||||
asm volatile (
|
||||
MEMACCESS(3)
|
||||
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
|
||||
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
|
||||
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
|
||||
"ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
|
||||
"sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
|
||||
"sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
|
||||
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
MEMACCESS(0)
|
||||
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"ld4 {v16.8b-v19.8b}, [%0], #32 \n" // load 8 ARGB pixels.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
|
||||
"vmovl.u8 q9, d18 \n" // g
|
||||
"vmovl.u8 q10, d20 \n" // r
|
||||
"vmovl.u8 q15, d22 \n" // a
|
||||
"vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
|
||||
"vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
|
||||
"vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
|
||||
"vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
|
||||
"vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
|
||||
"vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
|
||||
"vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
|
||||
"vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
|
||||
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
|
||||
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
|
||||
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
|
||||
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
|
||||
"vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
|
||||
"vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
|
||||
"vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
|
||||
"vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
|
||||
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
|
||||
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
|
||||
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
|
||||
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
|
||||
"vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
|
||||
"vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
|
||||
"vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
|
||||
"vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
|
||||
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
|
||||
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
|
||||
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
|
||||
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
|
||||
"vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
|
||||
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
|
||||
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
|
||||
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
|
||||
"uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
|
||||
"uxtl v17.8h, v17.8b \n" // g
|
||||
"uxtl v18.8h, v18.8b \n" // r
|
||||
"uxtl v19.8h, v19.8b \n" // a
|
||||
"mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
|
||||
"mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
|
||||
"mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
|
||||
"mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
|
||||
"mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
|
||||
"mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
|
||||
"mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
|
||||
"mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
|
||||
"sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
|
||||
"sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
|
||||
"sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
|
||||
"sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
|
||||
"mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
|
||||
"mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
|
||||
"mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
|
||||
"mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
|
||||
"sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
|
||||
"sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
|
||||
"sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
|
||||
"sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
|
||||
"mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
|
||||
"mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
|
||||
"mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
|
||||
"mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
|
||||
"sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
|
||||
"sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
|
||||
"sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
|
||||
"sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
|
||||
"sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
|
||||
"sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
|
||||
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
|
||||
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
|
||||
MEMACCESS(1)
|
||||
"vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
|
||||
"st4 {v16.8b-v19.8b}, [%1], #32 \n" // store 8 ARGB pixels.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(matrix_argb) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
|
||||
"q10", "q11", "q12", "q13", "q14", "q15"
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
|
||||
"v18", "v19", "v22", "v23", "v24", "v25"
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBCOLORMATRIXROW_NEON
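ARGBColorMatrixRow_NEON multiplies each pixel by a 4x4 signed matrix whose coefficients carry 6 fractional bits (64 == 1.0), as the final #6 shift shows: output B uses the first four matrix bytes, G the next four, and so on, each dotted with the input (B, G, R, A). One-output-channel scalar sketch, ignoring the intermediate 16-bit saturating adds in the NEON code (illustrative helper):

  static uint8 ColorMatrixChannel(const int8* row4, uint8 b, uint8 g, uint8 r, uint8 a) {
    int v = (b * row4[0] + g * row4[1] + r * row4[2] + a * row4[3]) >> 6;  // sqshrun #6
    return (uint8)(v < 0 ? 0 : v > 255 ? 255 : v);
  }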