mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
Fix for DivideRow_NEON functions
- The scale was duplicated into 8h lanes but multiplied as 4s lanes; now use umull (widening multiply) instead.
Bug: libyuv:951
Change-Id: If6cb01f5f006c2235886b81ce120642d7e24a9bb
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166563
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
541d8efbaf
commit
0faf8dd0e0
@ -3911,21 +3911,17 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vdup.16 q6, %3 \n"
|
||||
"vdup.16 d8, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q0, q1}, [%0]! \n"
|
||||
"vmovl.u16 q2, d0 \n"
|
||||
"vmovl.u16 q3, d1 \n"
|
||||
"vmovl.u16 q4, d2 \n"
|
||||
"vmovl.u16 q5, d3 \n"
|
||||
"vmul.u32 q2, q2, q6 \n"
|
||||
"vmul.u32 q3, q3, q6 \n"
|
||||
"vmul.u32 q4, q4, q6 \n"
|
||||
"vmul.u32 q5, q5, q6 \n"
|
||||
"vshrn.u32 d0, q2, #16 \n"
|
||||
"vshrn.u32 d1, q3, #16 \n"
|
||||
"vshrn.u32 d2, q4, #16 \n"
|
||||
"vshrn.u32 d3, q5, #16 \n"
|
||||
"vld1.16 {q2, q3}, [%0]! \n"
|
||||
"vmull.u16 q0, d4, d8 \n"
|
||||
"vmull.u16 q1, d5, d8 \n"
|
||||
"vmull.u16 q2, d6, d8 \n"
|
||||
"vmull.u16 q3, d7, d8 \n"
|
||||
"vshrn.u32 d0, q0, #16 \n"
|
||||
"vshrn.u32 d1, q1, #16 \n"
|
||||
"vshrn.u32 d2, q2, #16 \n"
|
||||
"vshrn.u32 d3, q3, #16 \n"
|
||||
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"bgt 1b \n"
|
||||
@ -3933,7 +3929,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "d8");
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending on how many bits there are:
|
||||
|
||||
@ -4461,22 +4461,18 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"dup v6.8h, %w3 \n"
|
||||
"dup v4.8h, %w3 \n"
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%0], #32 \n"
|
||||
"ushll v2.4s, v0.4h, #0 \n"
|
||||
"ushll2 v3.4s, v0.8h, #0 \n"
|
||||
"ushll v4.4s, v1.4h, #0 \n"
|
||||
"ushll2 v5.4s, v1.8h, #0 \n"
|
||||
"ldp q2, q3, [%0], #32 \n"
|
||||
"umull v0.4s, v2.4h, v4.4h \n"
|
||||
"umull2 v1.4s, v2.8h, v4.8h \n"
|
||||
"umull v2.4s, v3.4h, v4.4h \n"
|
||||
"umull2 v3.4s, v3.8h, v4.8h \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"mul v2.4s, v2.4s, v6.4s \n"
|
||||
"mul v3.4s, v3.4s, v6.4s \n"
|
||||
"mul v4.4s, v4.4s, v6.4s \n"
|
||||
"mul v5.4s, v5.4s, v6.4s \n"
|
||||
"shrn v0.4h, v2.4s, #16 \n"
|
||||
"shrn2 v0.8h, v3.4s, #16 \n"
|
||||
"shrn v1.4h, v4.4s, #16 \n"
|
||||
"shrn2 v1.8h, v5.4s, #16 \n"
|
||||
"shrn v0.4h, v0.4s, #16 \n"
|
||||
"shrn2 v0.8h, v1.4s, #16 \n"
|
||||
"shrn v1.4h, v2.4s, #16 \n"
|
||||
"shrn2 v1.8h, v3.4s, #16 \n"
|
||||
"stp q0, q1, [%1], #32 \n" // store 16 pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
||||
"b.gt 1b \n"
|
||||
@ -4484,7 +4480,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending on how many bits there are:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user