mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
Add ScaleARGBCols_NEON for ARM32/64
ARM32/64 NEON versions of ScaleARGBCols_NEON are implemented. BUG=319 TESTED=libyuvTest.* on ARM32/64 with Android R=fbarchard@google.com Change-Id: Id9ad97f7aa5d8a34cd55ace9e648cb6ff028efd9 Review URL: https://webrtc-codereview.appspot.com/47689004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1351 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
416c48dba7
commit
f23d6222ac
@ -66,6 +66,7 @@ extern "C" {
|
||||
#define HAS_SCALEARGBROWDOWN2_NEON
|
||||
#define HAS_SCALEADDROWS_NEON
|
||||
#define HAS_SCALEFILTERCOLS_NEON
|
||||
#define HAS_SCALEARGBCOLS_NEON
|
||||
#endif
|
||||
|
||||
// The following are available on Mips platforms:
|
||||
@ -354,6 +355,12 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||
void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx);
|
||||
|
||||
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
uint8* dst, int dst_width);
|
||||
void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
|
||||
@ -33,6 +33,9 @@ extern "C" {
|
||||
#ifdef HAS_SCALEFILTERCOLS_NEON
|
||||
CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEARGBCOLS_NEON
|
||||
CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
|
||||
#endif
|
||||
#undef CANY
|
||||
|
||||
// Fixed scale down.
|
||||
|
||||
@ -326,6 +326,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
@ -500,6 +508,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (!filtering && TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
|
||||
@ -607,6 +623,14 @@ static void ScaleARGBSimple(int src_width, int src_height,
|
||||
if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBCols = ScaleARGBCols_SSE2;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleARGBCols = ScaleARGBCols_Any_NEON;
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleARGBCols = ScaleARGBCols_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (src_width * 2 == dst_width && x < 0x8000) {
|
||||
ScaleARGBCols = ScaleARGBColsUp2_C;
|
||||
|
||||
@ -916,6 +916,49 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD1_DATA32_LANE(dn, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl 2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"vld1.32 {"#dn"["#n"]}, [%6] \n"
|
||||
|
||||
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) {
|
||||
int tmp;
|
||||
const uint8* src_tmp = src_argb;
|
||||
asm volatile (
|
||||
".p2align 2 \n"
|
||||
"1: \n"
|
||||
LOAD1_DATA32_LANE(d0, 0)
|
||||
LOAD1_DATA32_LANE(d0, 1)
|
||||
LOAD1_DATA32_LANE(d1, 0)
|
||||
LOAD1_DATA32_LANE(d1, 1)
|
||||
LOAD1_DATA32_LANE(d2, 0)
|
||||
LOAD1_DATA32_LANE(d2, 1)
|
||||
LOAD1_DATA32_LANE(d3, 0)
|
||||
LOAD1_DATA32_LANE(d3, 1)
|
||||
|
||||
MEMACCESS(0)
|
||||
"vst1.32 {q0, q1}, [%0]! \n" // store pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width), // %2
|
||||
"+r"(x), // %3
|
||||
"+r"(dx), // %4
|
||||
"+r"(tmp), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "q0", "q1"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD1_DATA32_LANE
|
||||
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -912,6 +912,49 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
|
||||
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
|
||||
);
|
||||
}
|
||||
|
||||
// TODO(Yang Zhang): Investigate less load instructions for
|
||||
// the x/dx stepping
|
||||
#define LOAD1_DATA32_LANE(vn, n) \
|
||||
"lsr %5, %3, #16 \n" \
|
||||
"add %6, %1, %5, lsl 2 \n" \
|
||||
"add %3, %3, %4 \n" \
|
||||
MEMACCESS(6) \
|
||||
"ld1 {"#vn".s}["#n"], [%6] \n"
|
||||
|
||||
void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
|
||||
int dst_width, int x, int dx) {
|
||||
int tmp;
|
||||
const uint8* src_tmp = src_argb;
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
LOAD1_DATA32_LANE(v0, 0)
|
||||
LOAD1_DATA32_LANE(v0, 1)
|
||||
LOAD1_DATA32_LANE(v0, 2)
|
||||
LOAD1_DATA32_LANE(v0, 3)
|
||||
LOAD1_DATA32_LANE(v1, 0)
|
||||
LOAD1_DATA32_LANE(v1, 1)
|
||||
LOAD1_DATA32_LANE(v1, 2)
|
||||
LOAD1_DATA32_LANE(v1, 3)
|
||||
|
||||
MEMACCESS(0)
|
||||
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width), // %2
|
||||
"+r"(x), // %3
|
||||
"+r"(dx), // %4
|
||||
"+r"(tmp), // %5
|
||||
"+r"(src_tmp) // %6
|
||||
:
|
||||
: "memory", "cc", "v0", "v1"
|
||||
);
|
||||
}
|
||||
|
||||
#undef LOAD1_DATA32_LANE
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user