mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Fix for divide row functions used by P010ToI010
Bug: libyuv:951
Change-Id: Id323656cb6f99b1be0be7aaa854d3cc15feeba69
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166562
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
d5aa3d4a76
commit
541d8efbaf
@ -24,7 +24,10 @@ namespace libyuv {
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// subsample amount uses a shift.
|
// Subsample amount uses a shift.
|
||||||
|
// v is value
|
||||||
|
// a is amount to add to round up
|
||||||
|
// s is shift to subsample down
|
||||||
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
|
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
|
||||||
static __inline int Abs(int v) {
|
static __inline int Abs(int v) {
|
||||||
return v >= 0 ? v : -v;
|
return v >= 0 ? v : -v;
|
||||||
@ -1291,7 +1294,6 @@ static int PxxxToIxxx(const uint16_t* src_y,
|
|||||||
if (width <= 0 || height == 0) {
|
if (width <= 0 || height == 0) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
|
ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
|
||||||
depth);
|
depth);
|
||||||
SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
|
SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
|
||||||
|
|||||||
@ -3911,31 +3911,29 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
|||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vdup.16 q0, %3 \n"
|
"vdup.16 q6, %3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q1}, [%0]! \n"
|
"vld1.16 {q0, q1}, [%0]! \n"
|
||||||
"vld1.16 {q2}, [%0]! \n"
|
"vmovl.u16 q2, d0 \n"
|
||||||
"vmovl.u16 q3, d2 \n"
|
"vmovl.u16 q3, d1 \n"
|
||||||
"vmovl.u16 q1, d3 \n"
|
"vmovl.u16 q4, d2 \n"
|
||||||
"vmovl.u16 q4, d4 \n"
|
"vmovl.u16 q5, d3 \n"
|
||||||
"vmovl.u16 q2, d5 \n"
|
"vmul.u32 q2, q2, q6 \n"
|
||||||
"vshl.u32 q3, q3, q0 \n"
|
"vmul.u32 q3, q3, q6 \n"
|
||||||
"vshl.u32 q4, q4, q0 \n"
|
"vmul.u32 q4, q4, q6 \n"
|
||||||
"vshl.u32 q1, q1, q0 \n"
|
"vmul.u32 q5, q5, q6 \n"
|
||||||
"vshl.u32 q2, q2, q0 \n"
|
"vshrn.u32 d0, q2, #16 \n"
|
||||||
"vmovn.u32 d2, q3 \n"
|
"vshrn.u32 d1, q3, #16 \n"
|
||||||
"vmovn.u32 d3, q1 \n"
|
"vshrn.u32 d2, q4, #16 \n"
|
||||||
"vmovn.u32 d4, q4 \n"
|
"vshrn.u32 d3, q5, #16 \n"
|
||||||
"vmovn.u32 d5, q2 \n"
|
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
|
||||||
"vst1.16 {q1}, [%1]! \n"
|
|
||||||
"vst1.16 {q2}, [%1]! \n"
|
|
||||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_y), // %0
|
: "+r"(src_y), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "r"(scale) // %3
|
: "r"(scale) // %3
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||||
|
|||||||
@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
|||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"dup v0.8h, %w3 \n"
|
"dup v6.8h, %w3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ldp q1, q2, [%0], #32 \n"
|
"ldp q0, q1, [%0], #32 \n"
|
||||||
"ushll v3.4s, v1.4h, #0 \n"
|
"ushll v2.4s, v0.4h, #0 \n"
|
||||||
"ushll v4.4s, v2.4h, #0 \n"
|
"ushll2 v3.4s, v0.8h, #0 \n"
|
||||||
|
"ushll v4.4s, v1.4h, #0 \n"
|
||||||
|
"ushll2 v5.4s, v1.8h, #0 \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"ushll2 v1.4s, v1.8h, #0 \n"
|
"mul v2.4s, v2.4s, v6.4s \n"
|
||||||
"ushll2 v2.4s, v2.8h, #0 \n"
|
"mul v3.4s, v3.4s, v6.4s \n"
|
||||||
"mul v3.4s, v0.4s, v3.4s \n"
|
"mul v4.4s, v4.4s, v6.4s \n"
|
||||||
"mul v4.4s, v0.4s, v4.4s \n"
|
"mul v5.4s, v5.4s, v6.4s \n"
|
||||||
"mul v1.4s, v0.4s, v1.4s \n"
|
"shrn v0.4h, v2.4s, #16 \n"
|
||||||
"mul v2.4s, v0.4s, v2.4s \n"
|
"shrn2 v0.8h, v3.4s, #16 \n"
|
||||||
"shrn v3.4h, v3.4s, #16 \n"
|
"shrn v1.4h, v4.4s, #16 \n"
|
||||||
"shrn v4.4h, v4.4s, #16 \n"
|
"shrn2 v1.8h, v5.4s, #16 \n"
|
||||||
"shrn2 v3.8h, v1.4s, #16 \n"
|
"stp q0, q1, [%1], #32 \n" // store 16 pixels
|
||||||
"shrn2 v4.8h, v2.4s, #16 \n"
|
|
||||||
"stp q3, q3, [%1], #32 \n" // store 16 pixels
|
|
||||||
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_y), // %0
|
: "+r"(src_y), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
: "r"(scale) // %3
|
: "r"(scale) // %3
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user