From 0faf8dd0e004520a61a603a4d2996d5ecc80dc3f Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 17 Jan 2023 14:00:35 -0800 Subject: [PATCH] Fix for DivideRow_NEON functions - was dup of 8h but mul of 4s. now use umull Bug: libyuv:951 Change-Id: If6cb01f5f006c2235886b81ce120642d7e24a9bb Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166563 Reviewed-by: Justin Green Commit-Queue: Frank Barchard --- source/row_neon.cc | 26 +++++++++++--------------- source/row_neon64.cc | 26 +++++++++++--------------- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/source/row_neon.cc b/source/row_neon.cc index 416f112fc..37f6db0cd 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3911,21 +3911,17 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "vdup.16 q6, %3 \n" + "vdup.16 d8, %3 \n" "1: \n" - "vld1.16 {q0, q1}, [%0]! \n" - "vmovl.u16 q2, d0 \n" - "vmovl.u16 q3, d1 \n" - "vmovl.u16 q4, d2 \n" - "vmovl.u16 q5, d3 \n" - "vmul.u32 q2, q2, q6 \n" - "vmul.u32 q3, q3, q6 \n" - "vmul.u32 q4, q4, q6 \n" - "vmul.u32 q5, q5, q6 \n" - "vshrn.u32 d0, q2, #16 \n" - "vshrn.u32 d1, q3, #16 \n" - "vshrn.u32 d2, q4, #16 \n" - "vshrn.u32 d3, q5, #16 \n" + "vld1.16 {q2, q3}, [%0]! \n" + "vmull.u16 q0, d4, d8 \n" + "vmull.u16 q1, d5, d8 \n" + "vmull.u16 q2, d6, d8 \n" + "vmull.u16 q3, d7, d8 \n" + "vshrn.u32 d0, q0, #16 \n" + "vshrn.u32 d1, q1, #16 \n" + "vshrn.u32 d2, q2, #16 \n" + "vshrn.u32 d3, q3, #16 \n" "vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels "subs %2, %2, #16 \n" // 16 src pixels per loop "bgt 1b \n" @@ -3933,7 +3929,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6"); + : "cc", "memory", "q0", "q1", "q2", "q3", "d8"); } // Use scale to convert lsb formats to msb, depending how many bits there are: diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 088174c87..7f04b6068 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -4461,22 +4461,18 @@ void DivideRow_16_NEON(const uint16_t* src_y, int scale, int width) { asm volatile( - "dup v6.8h, %w3 \n" + "dup v4.8h, %w3 \n" "1: \n" - "ldp q0, q1, [%0], #32 \n" - "ushll v2.4s, v0.4h, #0 \n" - "ushll2 v3.4s, v0.8h, #0 \n" - "ushll v4.4s, v1.4h, #0 \n" - "ushll2 v5.4s, v1.8h, #0 \n" + "ldp q2, q3, [%0], #32 \n" + "umull v0.4s, v2.4h, v4.4h \n" + "umull2 v1.4s, v2.8h, v4.8h \n" + "umull v2.4s, v3.4h, v4.4h \n" + "umull2 v3.4s, v3.8h, v4.8h \n" "prfm pldl1keep, [%0, 448] \n" - "mul v2.4s, v2.4s, v6.4s \n" - "mul v3.4s, v3.4s, v6.4s \n" - "mul v4.4s, v4.4s, v6.4s \n" - "mul v5.4s, v5.4s, v6.4s \n" - "shrn v0.4h, v2.4s, #16 \n" - "shrn2 v0.8h, v3.4s, #16 \n" - "shrn v1.4h, v4.4s, #16 \n" - "shrn2 v1.8h, v5.4s, #16 \n" + "shrn v0.4h, v0.4s, #16 \n" + "shrn2 v0.8h, v1.4s, #16 \n" + "shrn v1.4h, v2.4s, #16 \n" + "shrn2 v1.8h, v3.4s, #16 \n" "stp q0, q1, [%1], #32 \n" // store 16 pixels "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" @@ -4484,7 +4480,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, "+r"(dst_y), // %1 "+r"(width) // %2 : "r"(scale) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } // Use scale to convert lsb formats to msb, depending how many bits there are: