mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
Convert16To8 NEON
Pixel 3
Was C I010ToI420_Opt (749 ms)
Now NEON I010ToI420_Opt (356 ms)

Pixel 4
Was C I010ToI420_Opt (581 ms)
Now NEON I010ToI420_Opt (163 ms)

Bug: b/233233302, b/233634772
Change-Id: I60a84648a66f77d97c0a7822b29bd18b8e3a3355
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3661401
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
715150b5aa
commit
eb2c88e499
@ -441,6 +441,7 @@ extern "C" {
|
||||
#define HAS_BGRATOUVROW_NEON
|
||||
#define HAS_BGRATOYROW_NEON
|
||||
#define HAS_BYTETOFLOATROW_NEON
|
||||
#define HAS_CONVERT16TO8ROW_NEON
|
||||
#define HAS_COPYROW_NEON
|
||||
#define HAS_DETILEROW_NEON
|
||||
#define HAS_DETILESPLITUVROW_NEON
|
||||
@ -2596,6 +2597,14 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int scale,
|
||||
int width);
|
||||
void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
void Convert16To8Row_Any_NEON(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width);
|
||||
|
||||
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
|
||||
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
|
||||
|
||||
@ -147,6 +147,14 @@ void Convert16To8Plane(const uint16_t* src_y,
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_y = 0;
|
||||
}
|
||||
#if defined(HAS_CONVERT16TO8ROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
Convert16To8Row = Convert16To8Row_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
Convert16To8Row = Convert16To8Row_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
Convert16To8Row = Convert16To8Row_Any_SSSE3;
|
||||
|
||||
@ -1481,6 +1481,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
|
||||
uint8_t,
|
||||
31)
|
||||
#endif
|
||||
#ifdef HAS_CONVERT16TO8ROW_NEON
|
||||
ANY11C(Convert16To8Row_Any_NEON,
|
||||
Convert16To8Row_NEON,
|
||||
2,
|
||||
1,
|
||||
uint16_t,
|
||||
uint8_t,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_CONVERT8TO16ROW_SSE2
|
||||
ANY11C(Convert8To16Row_Any_SSE2,
|
||||
Convert8To16Row_SSE2,
|
||||
|
||||
@ -3599,7 +3599,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vdup.16 q2, %2 \n"
|
||||
"vdup.16 q2, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
@ -3607,13 +3607,12 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
|
||||
"vmul.u16 q1, q1, q2 \n"
|
||||
"vst1.16 {q0}, [%1]! \n"
|
||||
"vst1.16 {q1}, [%1]! \n"
|
||||
"subs %3, %3, #16 \n" // 16 src pixels per loop
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(scale), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2");
|
||||
}
|
||||
|
||||
@ -3622,7 +3621,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vdup.16 q0, %2 \n"
|
||||
"vdup.16 q0, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"vld1.16 {q2}, [%0]! \n"
|
||||
@ -3640,6 +3639,34 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
"vmovn.u32 d5, q2 \n"
|
||||
"vst1.16 {q1}, [%1]! \n"
|
||||
"vst1.16 {q2}, [%1]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to their 8 most significant bits, depending on how many bits there are:
|
||||
// 32768 = 9 bits
|
||||
// 16384 = 10 bits
|
||||
// 4096 = 12 bits
|
||||
// 256 = 16 bits
|
||||
void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vdup.16 q2, %2 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"vqdmulh.s16 q0, q0, q2 \n"
|
||||
"vqdmulh.s16 q1, q1, q2 \n"
|
||||
"vqshrn.u16 d0, q0, #1 \n"
|
||||
"vqshrn.u16 d1, q1, #1 \n"
|
||||
"vst1.16 {q0}, [%1]! \n"
|
||||
"subs %3, %3, #16 \n" // 16 src pixels per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
@ -3647,7 +3674,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
"+r"(scale), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
|
||||
: "cc", "memory", "q0", "q1", "q2");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
|
||||
|
||||
@ -4070,21 +4070,19 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"dup v2.8h, %w2 \n"
|
||||
"dup v2.8h, %w3 \n"
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%0], #32 \n"
|
||||
"mul v0.8h, v0.8h, v2.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"mul v1.8h, v1.8h, v2.8h \n"
|
||||
"stp q0, q1, [%1] \n" // store 16 pixels
|
||||
"add %1, %1, #32 \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 src pixels per loop
|
||||
"stp q0, q1, [%1], #32 \n" // store 16 pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(scale), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2");
|
||||
}
|
||||
|
||||
@ -4093,7 +4091,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"dup v0.8h, %w2 \n"
|
||||
"dup v0.8h, %w3 \n"
|
||||
"1: \n"
|
||||
"ldp q1, q2, [%0], #32 \n"
|
||||
"ushll v3.4s, v1.4h, #0 \n"
|
||||
@ -4109,18 +4107,44 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
"shrn v4.4h, v4.4s, #16 \n"
|
||||
"shrn2 v3.8h, v1.4s, #16 \n"
|
||||
"shrn2 v4.8h, v2.4s, #16 \n"
|
||||
"stp q3, q3, [%1] \n" // store 16 pixels
|
||||
"add %1, %1, #32 \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 src pixels per loop
|
||||
"stp q3, q3, [%1], #32 \n" // store 16 pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(scale), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to their 8 most significant bits, depending on how many bits there are:
|
||||
// 32768 = 9 bits
|
||||
// 16384 = 10 bits
|
||||
// 4096 = 12 bits
|
||||
// 256 = 16 bits
|
||||
// AArch64 NEON row kernel: narrows 16-bit samples from src_y to 8-bit samples
// in dst_y, multiplying each sample by scale on the way.
//   src_y: source row of uint16_t samples (advanced 32 bytes per iteration).
//   dst_y: destination row of uint8_t samples (advanced 16 bytes per iteration).
//   scale: multiplier broadcast to every 16-bit lane.
//   width: number of samples; decremented by 16 per iteration.
// The loop body runs before the width test, so at least 16 samples are always
// processed and width is consumed in steps of 16.
// NOTE(review): sqdmulh is a signed multiply, so a scale of 32768 or source
// samples >= 32768 would be read as negative 16-bit lanes - confirm whether
// those inputs are expected to reach this kernel.
void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
// Broadcast scale (%3) into all eight 16-bit lanes of v2.
"dup v2.8h, %w3 \n"
|
||||
"1: \n"
|
||||
// Load 16 source samples (32 bytes) and post-increment src_y.
"ldp q0, q1, [%0], #32 \n"
|
||||
// Signed saturating doubling multiply, high half: (2 * src * scale) >> 16.
"sqdmulh v0.8h, v0.8h, v2.8h \n"
|
||||
"sqdmulh v1.8h, v1.8h, v2.8h \n"
|
||||
// Prefetch source data 448 bytes ahead of the current read position.
"prfm pldl1keep, [%0, 448] \n"
|
||||
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
||||
// Unsigned saturating shift right by 1 with narrowing to 8 bits; the shift
// undoes the doubling above, and saturation clamps results to 255.
"uqshrn v0.8b, v0.8h, #1 \n"
|
||||
"uqshrn2 v0.16b, v1.8h, #1 \n"
|
||||
"str q0, [%1], #16 \n" // store 16 pixels
|
||||
// Loop while the flags set by subs show width is still greater than zero.
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user