Convert16To8 NEON

Pixel 3
Was C    I010ToI420_Opt (749 ms)
Now NEON I010ToI420_Opt (356 ms)

Pixel 4
Was C    I010ToI420_Opt (581 ms)
Now NEON I010ToI420_Opt (163 ms)

Bug: b/233233302, b/233634772
Change-Id: I60a84648a66f77d97c0a7822b29bd18b8e3a3355
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3661401
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-05-23 19:15:41 -07:00 committed by libyuv LUCI CQ
parent 715150b5aa
commit eb2c88e499
5 changed files with 98 additions and 21 deletions

View File

@ -441,6 +441,7 @@ extern "C" {
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_BYTETOFLOATROW_NEON
#define HAS_CONVERT16TO8ROW_NEON
#define HAS_COPYROW_NEON
#define HAS_DETILEROW_NEON
#define HAS_DETILESPLITUVROW_NEON
@ -2596,6 +2597,14 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int width);
// NEON row function: converts `width` 16-bit samples at src_y to 8-bit samples
// at dst_y, scaled by `scale`.
// NOTE(review): the kernel consumes 16 pixels per loop iteration and the plane
// dispatcher only selects it when width is a multiple of 16 - confirm that all
// other callers honor the same restriction.
void Convert16To8Row_NEON(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width);
// Any-width companion of Convert16To8Row_NEON; the plane dispatcher uses it
// whenever NEON is available and width is not a multiple of 16.
// NOTE(review): presumably generated by the ANY11C wrapper macro, which handles
// the leftover pixels - confirm against the macro definition.
void Convert16To8Row_Any_NEON(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);

View File

@ -147,6 +147,14 @@ void Convert16To8Plane(const uint16_t* src_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_CONVERT16TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Convert16To8Row = Convert16To8Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
Convert16To8Row = Convert16To8Row_NEON;
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3;

View File

@ -1481,6 +1481,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
uint8_t,
31)
#endif
#ifdef HAS_CONVERT16TO8ROW_NEON
// Instantiates Convert16To8Row_Any_NEON on top of Convert16To8Row_NEON.
// NOTE(review): 2 and 1 look like the source/destination bytes per sample
// (uint16_t in, uint8_t out) and 15 like the width mask matching the kernel's
// 16-pixels-per-loop step - confirm against the ANY11C definition.
ANY11C(Convert16To8Row_Any_NEON,
Convert16To8Row_NEON,
2,
1,
uint16_t,
uint8_t,
15)
#endif
#ifdef HAS_CONVERT8TO16ROW_SSE2
ANY11C(Convert8To16Row_Any_SSE2,
Convert8To16Row_SSE2,

View File

@ -3599,7 +3599,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
"vdup.16 q2, %2 \n"
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n"
@ -3607,13 +3607,12 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
"vmul.u16 q1, q1, q2 \n"
"vst1.16 {q0}, [%1]! \n"
"vst1.16 {q1}, [%1]! \n"
"subs %3, %3, #16 \n" // 16 src pixels per loop
"subs %2, %2, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(scale), // %2
"+r"(width) // %3
:
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "q0", "q1", "q2");
}
@ -3622,7 +3621,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
"vdup.16 q0, %2 \n"
"vdup.16 q0, %3 \n"
"1: \n"
"vld1.16 {q1}, [%0]! \n"
"vld1.16 {q2}, [%0]! \n"
@ -3640,6 +3639,34 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"vmovn.u32 d5, q2 \n"
"vst1.16 {q1}, [%1]! \n"
"vst1.16 {q2}, [%1]! \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
// Use scale to convert lsb formats to msb, depending on how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
void Convert16To8Row_NEON(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
asm volatile(
"vdup.16 q2, %2 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n"
"vqdmulh.s16 q0, q0, q2 \n"
"vqdmulh.s16 q1, q1, q2 \n"
"vqshrn.u16 d0, q0, #1 \n"
"vqshrn.u16 d1, q1, #1 \n"
"vst1.16 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
: "+r"(src_y), // %0
@ -3647,7 +3674,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"+r"(scale), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
: "cc", "memory", "q0", "q1", "q2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..

View File

@ -4070,21 +4070,19 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
"dup v2.8h, %w2 \n"
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"mul v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"mul v1.8h, v1.8h, v2.8h \n"
"stp q0, q1, [%1] \n" // store 16 pixels
"add %1, %1, #32 \n"
"subs %w3, %w3, #16 \n" // 16 src pixels per loop
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(scale), // %2
"+r"(width) // %3
:
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "v0", "v1", "v2");
}
@ -4093,7 +4091,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
"dup v0.8h, %w2 \n"
"dup v0.8h, %w3 \n"
"1: \n"
"ldp q1, q2, [%0], #32 \n"
"ushll v3.4s, v1.4h, #0 \n"
@ -4109,18 +4107,44 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"shrn v4.4h, v4.4s, #16 \n"
"shrn2 v3.8h, v1.4s, #16 \n"
"shrn2 v4.8h, v2.4s, #16 \n"
"stp q3, q3, [%1] \n" // store 16 pixels
"add %1, %1, #32 \n"
"subs %w3, %w3, #16 \n" // 16 src pixels per loop
"stp q3, q3, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(scale), // %2
"+r"(width) // %3
:
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Convert a row of 16-bit samples to 8-bit: dst = min(255, (src * scale) >> 16).
//
// Use scale to convert lsb formats to msb, depending on how many bits there
// are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
//
// src_y  - source row of 16-bit samples.
// dst_y  - destination row of 8-bit samples.
// scale  - 16.16 fixed-point multiplier; only the low 16 bits are used and
//          they are treated as unsigned.
// width  - number of samples. 16 samples are processed per iteration and the
//          loop always runs at least once, so callers are expected to pass a
//          positive multiple of 16.
//
// The multiply is an unsigned widening multiply. A signed doubling multiply
// (sqdmulh) would interpret scale 32768 as -32768 and samples >= 0x8000 as
// negative, which saturates those results to 255 instead of scaling them.
void Convert16To8Row_NEON(const uint16_t* src_y,
                          uint8_t* dst_y,
                          int scale,
                          int width) {
  asm volatile(
      "dup         v2.8h, %w3                    \n"  // broadcast scale
      "1:                                        \n"
      "ldp         q0, q1, [%0], #32             \n"  // load 16 samples
      "umull       v3.4s, v0.4h, v2.4h           \n"  // 32-bit src * scale
      "umull2      v4.4s, v0.8h, v2.8h           \n"
      "umull       v5.4s, v1.4h, v2.4h           \n"
      "umull2      v6.4s, v1.8h, v2.8h           \n"
      "prfm        pldl1keep, [%0, 448]          \n"
      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
      "shrn        v0.4h, v3.4s, #16             \n"  // keep the high 16 bits
      "shrn2       v0.8h, v4.4s, #16             \n"
      "shrn        v1.4h, v5.4s, #16             \n"
      "shrn2       v1.8h, v6.4s, #16             \n"
      "uqxtn       v0.8b, v0.8h                  \n"  // saturate to 0..255
      "uqxtn2      v0.16b, v1.8h                 \n"
      "str         q0, [%1], #16                 \n"  // store 16 pixels
      "b.gt        1b                            \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus