Fix for DivideRow_16_NEON functions

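- The scale was duplicated into 16-bit lanes (dup 8h) but then multiplied as 32-bit lanes (mul 4s). Now use a widening 16-bit multiply (umull / vmull.u16) instead.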

Bug: libyuv:951
Change-Id: If6cb01f5f006c2235886b81ce120642d7e24a9bb
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166563
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2023-01-17 14:00:35 -08:00 committed by libyuv LUCI CQ
parent 541d8efbaf
commit 0faf8dd0e0
2 changed files with 22 additions and 30 deletions

View File

@ -3911,21 +3911,17 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
"vdup.16 q6, %3 \n"
"vdup.16 d8, %3 \n"
"1: \n"
"vld1.16 {q0, q1}, [%0]! \n"
"vmovl.u16 q2, d0 \n"
"vmovl.u16 q3, d1 \n"
"vmovl.u16 q4, d2 \n"
"vmovl.u16 q5, d3 \n"
"vmul.u32 q2, q2, q6 \n"
"vmul.u32 q3, q3, q6 \n"
"vmul.u32 q4, q4, q6 \n"
"vmul.u32 q5, q5, q6 \n"
"vshrn.u32 d0, q2, #16 \n"
"vshrn.u32 d1, q3, #16 \n"
"vshrn.u32 d2, q4, #16 \n"
"vshrn.u32 d3, q5, #16 \n"
"vld1.16 {q2, q3}, [%0]! \n"
"vmull.u16 q0, d4, d8 \n"
"vmull.u16 q1, d5, d8 \n"
"vmull.u16 q2, d6, d8 \n"
"vmull.u16 q3, d7, d8 \n"
"vshrn.u32 d0, q0, #16 \n"
"vshrn.u32 d1, q1, #16 \n"
"vshrn.u32 d2, q2, #16 \n"
"vshrn.u32 d3, q3, #16 \n"
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
"subs %2, %2, #16 \n" // 16 src pixels per loop
"bgt 1b \n"
@ -3933,7 +3929,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
: "cc", "memory", "q0", "q1", "q2", "q3", "d8");
}
// Use scale to convert lsb formats to msb, depending how many bits there are:

View File

@ -4461,22 +4461,18 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale,
int width) {
asm volatile(
"dup v6.8h, %w3 \n"
"dup v4.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"ushll v2.4s, v0.4h, #0 \n"
"ushll2 v3.4s, v0.8h, #0 \n"
"ushll v4.4s, v1.4h, #0 \n"
"ushll2 v5.4s, v1.8h, #0 \n"
"ldp q2, q3, [%0], #32 \n"
"umull v0.4s, v2.4h, v4.4h \n"
"umull2 v1.4s, v2.8h, v4.8h \n"
"umull v2.4s, v3.4h, v4.4h \n"
"umull2 v3.4s, v3.8h, v4.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"mul v2.4s, v2.4s, v6.4s \n"
"mul v3.4s, v3.4s, v6.4s \n"
"mul v4.4s, v4.4s, v6.4s \n"
"mul v5.4s, v5.4s, v6.4s \n"
"shrn v0.4h, v2.4s, #16 \n"
"shrn2 v0.8h, v3.4s, #16 \n"
"shrn v1.4h, v4.4s, #16 \n"
"shrn2 v1.8h, v5.4s, #16 \n"
"shrn v0.4h, v0.4s, #16 \n"
"shrn2 v0.8h, v1.4s, #16 \n"
"shrn v1.4h, v2.4s, #16 \n"
"shrn2 v1.8h, v3.4s, #16 \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
@ -4484,7 +4480,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Use scale to convert lsb formats to msb, depending how many bits there are: