Fix the DivideRow_16_NEON row functions (32-bit and 64-bit NEON) used by P010ToI010

Bug: libyuv:951
Change-Id: Id323656cb6f99b1be0be7aaa854d3cc15feeba69
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166562
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2023-01-17 13:06:38 -08:00 committed by libyuv LUCI CQ
parent d5aa3d4a76
commit 541d8efbaf
3 changed files with 36 additions and 36 deletions

View File

@ -24,7 +24,10 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// subsample amount uses a shift. // Subsample amount uses a shift.
// v is value
// a is amount to add to round up
// s is shift to subsample down
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
static __inline int Abs(int v) { static __inline int Abs(int v) {
return v >= 0 ? v : -v; return v >= 0 ? v : -v;
@ -1291,7 +1294,6 @@ static int PxxxToIxxx(const uint16_t* src_y,
if (width <= 0 || height == 0) { if (width <= 0 || height == 0) {
return -1; return -1;
} }
ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
depth); depth);
SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,

View File

@ -3911,31 +3911,29 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale, int scale,
int width) { int width) {
asm volatile( asm volatile(
"vdup.16 q0, %3 \n" "vdup.16 q6, %3 \n"
"1: \n" "1: \n"
"vld1.16 {q1}, [%0]! \n" "vld1.16 {q0, q1}, [%0]! \n"
"vld1.16 {q2}, [%0]! \n" "vmovl.u16 q2, d0 \n"
"vmovl.u16 q3, d2 \n" "vmovl.u16 q3, d1 \n"
"vmovl.u16 q1, d3 \n" "vmovl.u16 q4, d2 \n"
"vmovl.u16 q4, d4 \n" "vmovl.u16 q5, d3 \n"
"vmovl.u16 q2, d5 \n" "vmul.u32 q2, q2, q6 \n"
"vshl.u32 q3, q3, q0 \n" "vmul.u32 q3, q3, q6 \n"
"vshl.u32 q4, q4, q0 \n" "vmul.u32 q4, q4, q6 \n"
"vshl.u32 q1, q1, q0 \n" "vmul.u32 q5, q5, q6 \n"
"vshl.u32 q2, q2, q0 \n" "vshrn.u32 d0, q2, #16 \n"
"vmovn.u32 d2, q3 \n" "vshrn.u32 d1, q3, #16 \n"
"vmovn.u32 d3, q1 \n" "vshrn.u32 d2, q4, #16 \n"
"vmovn.u32 d4, q4 \n" "vshrn.u32 d3, q5, #16 \n"
"vmovn.u32 d5, q2 \n" "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
"vst1.16 {q1}, [%1]! \n"
"vst1.16 {q2}, [%1]! \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop "subs %2, %2, #16 \n" // 16 src pixels per loop
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(scale) // %3 : "r"(scale) // %3
: "cc", "memory", "q0", "q1", "q2", "q3", "q4"); : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
} }
// Use scale to convert lsb formats to msb, depending how many bits there are: // Use scale to convert lsb formats to msb, depending how many bits there are:

View File

@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y,
int scale, int scale,
int width) { int width) {
asm volatile( asm volatile(
"dup v0.8h, %w3 \n" "dup v6.8h, %w3 \n"
"1: \n" "1: \n"
"ldp q1, q2, [%0], #32 \n" "ldp q0, q1, [%0], #32 \n"
"ushll v3.4s, v1.4h, #0 \n" "ushll v2.4s, v0.4h, #0 \n"
"ushll v4.4s, v2.4h, #0 \n" "ushll2 v3.4s, v0.8h, #0 \n"
"ushll v4.4s, v1.4h, #0 \n"
"ushll2 v5.4s, v1.8h, #0 \n"
"prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%0, 448] \n"
"ushll2 v1.4s, v1.8h, #0 \n" "mul v2.4s, v2.4s, v6.4s \n"
"ushll2 v2.4s, v2.8h, #0 \n" "mul v3.4s, v3.4s, v6.4s \n"
"mul v3.4s, v0.4s, v3.4s \n" "mul v4.4s, v4.4s, v6.4s \n"
"mul v4.4s, v0.4s, v4.4s \n" "mul v5.4s, v5.4s, v6.4s \n"
"mul v1.4s, v0.4s, v1.4s \n" "shrn v0.4h, v2.4s, #16 \n"
"mul v2.4s, v0.4s, v2.4s \n" "shrn2 v0.8h, v3.4s, #16 \n"
"shrn v3.4h, v3.4s, #16 \n" "shrn v1.4h, v4.4s, #16 \n"
"shrn v4.4h, v4.4s, #16 \n" "shrn2 v1.8h, v5.4s, #16 \n"
"shrn2 v3.8h, v1.4s, #16 \n" "stp q0, q1, [%1], #32 \n" // store 16 pixels
"shrn2 v4.8h, v2.4s, #16 \n"
"stp q3, q3, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop "subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_y), // %0 : "+r"(src_y), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(scale) // %3 : "r"(scale) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4"); : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
} }
// Use scale to convert lsb formats to msb, depending how many bits there are: // Use scale to convert lsb formats to msb, depending how many bits there are: