mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
Add ScaleFilterCols_NEON for ARM32/64
ARM32/64 NEON versions of ScaleFilterCols_NEON are implemented.

BUG=319
TESTED=libyuvTest.* on ARM32/64 with Android
R=fbarchard@google.com

Change-Id: I5b0838769ffb0182155d7cd6bcc520eb81eb5c4e
Review URL: https://webrtc-codereview.appspot.com/41349004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1340 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent 70e5c81860, commit d6d7de5742
@@ -203,6 +203,7 @@ extern "C" {
#define HAS_I444TOARGBROW_AVX2
#define HAS_I411TOARGBROW_AVX2
#define HAS_J400TOARGBROW_AVX2
#define HAS_J422TOARGBROW_AVX2
// TODO(fbarchard): Port to Neon
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565DITHERROW_AVX2
@@ -233,7 +234,6 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
#define HAS_J422TOARGBROW_AVX2

// The following require HAS_I422TOARGBROW_AVX2
#if defined(HAS_I422TOARGBROW_AVX2)
@@ -52,6 +52,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEADDROWS_NEON
#define HAS_SCALEFILTERCOLS_NEON
#endif

// The following are available on Mips platforms:
@@ -311,6 +312,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height);

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx);

void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                              int dst_width, int x, int dx);

void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst, int dst_width);
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
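For reference, the contract behind these declarations (the header appears to be include/libyuv/scale_row.h): x and dx are 16.16 fixed-point source positions, and each output pixel blends the two source pixels straddling x by the fractional part of x. Below is a minimal scalar sketch of that behavior; the function name is illustrative and not libyuv's, whose portable version is ScaleFilterCols_C, and right-border handling is simplified.

// Illustrative scalar model of the ScaleFilterCols contract (not libyuv code).
// x and dx are 16.16 fixed point.
void ScaleFilterColsSketch(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;               // integer source index
    int frac = x & 0xffff;          // blend weight toward the next pixel
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)(a + (((b - a) * frac) >> 16));
    x += dx;                        // step to the next source position
  }
}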
@@ -55,6 +55,7 @@
      'source/row_win.cc',
      'source/scale.cc',
      'source/scale_argb.cc',
      'source/scale_any.cc',
      'source/scale_common.cc',
      'source/scale_mips.cc',
      'source/scale_posix.cc',
@@ -928,6 +928,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_SSSE3;
  }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleFilterCols = ScaleFilterCols_NEON;
    }
  }
#endif
  if (y > max_y) {
    y = max_y;
@@ -1119,6 +1127,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_SSSE3;
  }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleFilterCols = ScaleFilterCols_NEON;
    }
  }
#endif
  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
    ScaleFilterCols = ScaleColsUp2_C;
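Both hunks above (apparently source/scale.cc) follow the same dispatch rule: select the Any wrapper, which handles any dst_width, then tighten to the fully vectorized kernel when dst_width is a multiple of 8; the src_width < 32768 guard keeps the 16.16 source index in range. From the public API, this path is reached by any bilinear plane scale on a NEON-capable CPU. A minimal usage sketch follows; the wrapper name is illustrative and the ScalePlane call is libyuv's public plane-scaling entry point as I understand it.

#include "libyuv/scale.h"  // libyuv::ScalePlane, libyuv::kFilterBilinear

// Illustrative caller: a bilinear plane scale that would pick
// ScaleFilterCols_{Any_}NEON on a NEON CPU (assuming src_width < 32768).
void ScaleGrayPlaneSketch(const uint8* src, int src_width, int src_height,
                          uint8* dst, int dst_width, int dst_height) {
  libyuv::ScalePlane(src, src_width, src_width, src_height,
                     dst, dst_width, dst_width, dst_height,
                     libyuv::kFilterBilinear);
}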
source/scale_any.cc (new file, 42 lines)
@@ -0,0 +1,42 @@
/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"
#include "libyuv/scale_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
                 int dst_width, int x, int dx) {                               \
      int n = dst_width & ~MASK;                                               \
      if (n > 0) {                                                             \
        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
      }                                                                        \
      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
             dst_width & MASK, x + n * dx, dx);                                \
    }

#ifdef HAS_SCALEFILTERCOLS_NEON
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
#endif
#undef CANY

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
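Expanded by hand, the single CANY instantiation above produces a wrapper that runs the NEON kernel on the largest multiple-of-8 prefix of the row and lets the C kernel finish the remaining 0..7 pixels, resuming at the fixed-point position the SIMD part reached. Roughly, as a hand expansion for illustration (not the preprocessor's literal output):

// Hand expansion of CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON,
//                        ScaleFilterCols_C, 1, 7), shown for illustration.
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                              int dst_width, int x, int dx) {
  int n = dst_width & ~7;                   // largest multiple of 8
  if (n > 0) {
    ScaleFilterCols_NEON(dst_ptr, src_ptr, n, x, dx);
  }
  // Remainder: advance dst by n pixels (BPP == 1) and x by n steps of dx.
  ScaleFilterCols_C(dst_ptr + n * 1, src_ptr,
                    dst_width & 7, x + n * dx, dx);
}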
@@ -575,6 +575,73 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  );
}

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        r12, %1, %5                    \n"             \
    "add        %3, %3, %4                     \n"             \
    "vld2.8     {d6["#n"], d7["#n"]}, [r12]    \n"

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int tmp[4] = {0, 1, 2, 3};
  asm volatile (
    ".p2align   2                              \n"
    "vdup.32    q0, %3                         \n"  // x
    "vdup.32    q1, %4                         \n"  // dx
    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
    "vmul.s32   q1, q1, q2                     \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32   q1, q1, q0                     \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32   q2, q1, q3                     \n"
    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov       q10, q1                        \n"
    "vmov       q11, q2                        \n"
    "vuzp.16    q10, q11                       \n"
    "vmovl.u8   q8, d6                         \n"
    "vmovl.u8   q9, d7                         \n"
    "vsubl.s16  q11, d18, d16                  \n"
    "vsubl.s16  q12, d19, d17                  \n"
    "vmovl.u16  q13, d20                       \n"
    "vmovl.u16  q10, d21                       \n"
    "vmul.s32   q11, q11, q13                  \n"
    "vmul.s32   q12, q12, q10                  \n"
    "vshrn.s32  d18, q11, #16                  \n"
    "vshrn.s32  d19, q12, #16                  \n"
    "vadd.s16   q8, q8, q9                     \n"
    "vmovn.s16  d6, q8                         \n"

    MEMACCESS(0)
    "vst1.8     {d6}, [%0]!                    \n"  // store pixels
    "vadd.s32   q1, q1, q0                     \n"
    "vadd.s32   q2, q2, q0                     \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "bgt        1b                             \n"
  : "+r"(dst_ptr)    // %0
  : "r"(src_ptr),    // %1
    "r"(dst_width),  // %2
    "r"(x),          // %3
    "r"(dx),         // %4
    "r"(tmp)         // %5
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
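In this ARM32 kernel (apparently source/scale_neon.cc), LOAD2_DATA8_LANE is the gather step: for each lane it takes x >> 16 as the source index, loads src[xi] and src[xi + 1] into matching lanes of d6/d7, and steps x by dx; the arithmetic that follows is the same a + (((b - a) * frac) >> 16) blend as the scalar sketch earlier. A plain-C model of how the vector prologue seeds the per-lane positions (q1 holds lanes 0..3, q2 lanes 4..7, q0 becomes the per-iteration step) is shown below; the helper name is illustrative.

// Illustrative scalar model of the vector prologue above (not libyuv code).
// lane_lo mirrors q1, lane_hi mirrors q2, *step mirrors q0 after "vshl.i32 q0, q3, #1".
void BuildLanePositions(int x, int dx,
                        int lane_lo[4], int lane_hi[4], int* step) {
  for (int i = 0; i < 4; ++i) {
    lane_lo[i] = x + i * dx;            // vmul.s32 q1, q1, q2 ; vadd.s32 q1, q1, q0
    lane_hi[i] = lane_lo[i] + 4 * dx;   // vadd.s32 q2, q1, q3 (q3 holds 4 * dx)
  }
  *step = 8 * dx;                       // both halves advance by 8 * dx per loop
}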
@@ -578,6 +578,72 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
  );
}

// TODO(Yang Zhang): Investigate less load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)                                    \
    "lsr        %5, %3, #16                    \n"             \
    "add        x12, %1, %5                    \n"             \
    "add        %3, %3, %4                     \n"             \
    "ld2        {v4.b, v5.b}["#n"], [x12]      \n"

void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  int tmp[4] = {0, 1, 2, 3};
  asm volatile (
    "dup        v0.4s, %w3                     \n"  // x
    "dup        v1.4s, %w4                     \n"  // dx
    "ld1        {v2.4s}, [%5]                  \n"  // 0 1 2 3
    "shl        v3.4s, v1.4s, #2               \n"  // 4 * dx
    "mul        v1.4s, v1.4s, v2.4s            \n"
    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "add        v1.4s, v1.4s, v0.4s            \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "add        v2.4s, v1.4s, v3.4s            \n"
    "shl        v0.4s, v3.4s, #1               \n"  // 8 * dx
  "1:                                          \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "mov        v6.16b, v1.16b                 \n"
    "mov        v7.16b, v2.16b                 \n"
    "uzp1       v6.8h, v6.8h, v7.8h            \n"
    "ushll      v4.8h, v4.8b, #0               \n"
    "ushll      v5.8h, v5.8b, #0               \n"
    "ssubl      v16.4s, v5.4h, v4.4h           \n"
    "ssubl2     v17.4s, v5.8h, v4.8h           \n"
    "ushll      v7.4s, v6.4h, #0               \n"
    "ushll2     v6.4s, v6.8h, #0               \n"
    "mul        v16.4s, v16.4s, v7.4s          \n"
    "mul        v17.4s, v17.4s, v6.4s          \n"
    "shrn       v6.4h, v16.4s, #16             \n"
    "shrn2      v6.8h, v17.4s, #16             \n"
    "add        v4.8h, v4.8h, v6.8h            \n"
    "xtn        v4.8b, v4.8h                   \n"

    MEMACCESS(0)
    "st1        {v4.8b}, [%0], #8              \n"  // store pixels
    "add        v1.4s, v1.4s, v0.4s            \n"
    "add        v2.4s, v2.4s, v0.4s            \n"
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    "b.gt       1b                             \n"
  : "+r"(dst_ptr)                       // %0
  : "r"(src_ptr),                       // %1
    "r"(dst_width),                     // %2
    "r"(static_cast<ptrdiff_t>(x)),     // %3
    "r"(static_cast<ptrdiff_t>(dx)),    // %4
    "r"(tmp)                            // %5
  : "memory", "cc", "x12", "v0", "v1", "v2", "v3",
    "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
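Since the AArch64 kernel (apparently source/scale_neon64.cc) implements the same blend as the C reference, a quick spot check in the spirit of the libyuvTest.* runs mentioned in the commit message can compare the two directly. The harness below is a hedged sketch, not libyuv's actual test: the function name, sizes, and fill pattern are made up, and the dx setup is a plain ratio without libyuv's centering adjustments.

// Illustrative spot check (not from libyuv's test suite): compare the NEON
// kernel against ScaleFilterCols_C on a multiple-of-8 output width.
#include <assert.h>
#include <string.h>
#include "libyuv/scale_row.h"  // ScaleFilterCols_C, ScaleFilterCols_NEON

void SpotCheckScaleFilterCols(void) {
  enum { kSrcWidth = 1000, kDstWidth = 640 };  // dst_width is a multiple of 8
  uint8 src[kSrcWidth + 1];                    // +1: the filter reads src[xi + 1]
  uint8 dst_c[kDstWidth];
  uint8 dst_neon[kDstWidth];
  for (int i = 0; i < kSrcWidth + 1; ++i) {
    src[i] = (uint8)(i * 13 + 7);              // arbitrary deterministic pattern
  }
  int x = 0;
  int dx = (kSrcWidth << 16) / kDstWidth;      // 16.16 step; real callers also center x
  libyuv::ScaleFilterCols_C(dst_c, src, kDstWidth, x, dx);
#if defined(HAS_SCALEFILTERCOLS_NEON)
  libyuv::ScaleFilterCols_NEON(dst_neon, src, kDstWidth, x, dx);
  assert(memcmp(dst_c, dst_neon, kDstWidth) == 0);  // expected to agree here
#endif
}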