mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Fix for divide row functions used by P010ToI010
Bug: libyuv:951
Change-Id: Id323656cb6f99b1be0be7aaa854d3cc15feeba69
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166562
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
d5aa3d4a76
commit
541d8efbaf
@ -24,7 +24,10 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// subsample amount uses a shift.
|
||||
// Subsample amount uses a shift.
|
||||
// v is value
|
||||
// a is amount to add to round up
|
||||
// s is shift to subsample down
|
||||
// Rounds the magnitude of v: adds a to |v|, shifts right by s, then restores
// the sign of v. Every argument and the whole expansion are parenthesized so
// the macro is safe inside larger expressions and with expression arguments.
// Note: v is evaluated more than once; do not pass arguments with side effects.
#define SUBSAMPLE(v, a, s) \
  (((v) < 0) ? (-((-(v) + (a)) >> (s))) : (((v) + (a)) >> (s)))
|
||||
static __inline int Abs(int v) {
|
||||
return v >= 0 ? v : -v;
|
||||
@ -1291,7 +1294,6 @@ static int PxxxToIxxx(const uint16_t* src_y,
|
||||
if (width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
|
||||
depth);
|
||||
SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
|
||||
|
||||
@ -3911,31 +3911,29 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vdup.16 q0, %3 \n"
|
||||
"vdup.16 q6, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"vld1.16 {q2}, [%0]! \n"
|
||||
"vmovl.u16 q3, d2 \n"
|
||||
"vmovl.u16 q1, d3 \n"
|
||||
"vmovl.u16 q4, d4 \n"
|
||||
"vmovl.u16 q2, d5 \n"
|
||||
"vshl.u32 q3, q3, q0 \n"
|
||||
"vshl.u32 q4, q4, q0 \n"
|
||||
"vshl.u32 q1, q1, q0 \n"
|
||||
"vshl.u32 q2, q2, q0 \n"
|
||||
"vmovn.u32 d2, q3 \n"
|
||||
"vmovn.u32 d3, q1 \n"
|
||||
"vmovn.u32 d4, q4 \n"
|
||||
"vmovn.u32 d5, q2 \n"
|
||||
"vst1.16 {q1}, [%1]! \n"
|
||||
"vst1.16 {q2}, [%1]! \n"
|
||||
"vld1.16 {q0, q1}, [%0]! \n"
|
||||
"vmovl.u16 q2, d0 \n"
|
||||
"vmovl.u16 q3, d1 \n"
|
||||
"vmovl.u16 q4, d2 \n"
|
||||
"vmovl.u16 q5, d3 \n"
|
||||
"vmul.u32 q2, q2, q6 \n"
|
||||
"vmul.u32 q3, q3, q6 \n"
|
||||
"vmul.u32 q4, q4, q6 \n"
|
||||
"vmul.u32 q5, q5, q6 \n"
|
||||
"vshrn.u32 d0, q2, #16 \n"
|
||||
"vshrn.u32 d1, q3, #16 \n"
|
||||
"vshrn.u32 d2, q4, #16 \n"
|
||||
"vshrn.u32 d3, q5, #16 \n"
|
||||
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||
|
||||
@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"dup v0.8h, %w3 \n"
|
||||
"dup v6.8h, %w3 \n"
|
||||
"1: \n"
|
||||
"ldp q1, q2, [%0], #32 \n"
|
||||
"ushll v3.4s, v1.4h, #0 \n"
|
||||
"ushll v4.4s, v2.4h, #0 \n"
|
||||
"ldp q0, q1, [%0], #32 \n"
|
||||
"ushll v2.4s, v0.4h, #0 \n"
|
||||
"ushll2 v3.4s, v0.8h, #0 \n"
|
||||
"ushll v4.4s, v1.4h, #0 \n"
|
||||
"ushll2 v5.4s, v1.8h, #0 \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"ushll2 v1.4s, v1.8h, #0 \n"
|
||||
"ushll2 v2.4s, v2.8h, #0 \n"
|
||||
"mul v3.4s, v0.4s, v3.4s \n"
|
||||
"mul v4.4s, v0.4s, v4.4s \n"
|
||||
"mul v1.4s, v0.4s, v1.4s \n"
|
||||
"mul v2.4s, v0.4s, v2.4s \n"
|
||||
"shrn v3.4h, v3.4s, #16 \n"
|
||||
"shrn v4.4h, v4.4s, #16 \n"
|
||||
"shrn2 v3.8h, v1.4s, #16 \n"
|
||||
"shrn2 v4.8h, v2.4s, #16 \n"
|
||||
"stp q3, q3, [%1], #32 \n" // store 16 pixels
|
||||
"mul v2.4s, v2.4s, v6.4s \n"
|
||||
"mul v3.4s, v3.4s, v6.4s \n"
|
||||
"mul v4.4s, v4.4s, v6.4s \n"
|
||||
"mul v5.4s, v5.4s, v6.4s \n"
|
||||
"shrn v0.4h, v2.4s, #16 \n"
|
||||
"shrn2 v0.8h, v3.4s, #16 \n"
|
||||
"shrn v1.4h, v4.4s, #16 \n"
|
||||
"shrn2 v1.8h, v5.4s, #16 \n"
|
||||
"stp q0, q1, [%1], #32 \n" // store 16 pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user