From 0faf8dd0e004520a61a603a4d2996d5ecc80dc3f Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Tue, 17 Jan 2023 14:00:35 -0800
Subject: [PATCH] Fix for DivideRow_NEON functions

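- The scale was broadcast with dup as 8h (16-bit lanes) but multiplied with mul as 4s (32-bit lanes); now use umull (widening 16-bit multiply) instead.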

Bug: libyuv:951
Change-Id: If6cb01f5f006c2235886b81ce120642d7e24a9bb
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166563
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
---
 source/row_neon.cc   | 26 +++++++++++---------------
 source/row_neon64.cc | 26 +++++++++++---------------
 2 files changed, 22 insertions(+), 30 deletions(-)

diff --git a/source/row_neon.cc b/source/row_neon.cc
index 416f112fc..37f6db0cd 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3911,21 +3911,17 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "vdup.16     q6, %3                        \n"
+      "vdup.16     d8, %3                        \n"
       "1:                                        \n"
-      "vld1.16     {q0, q1}, [%0]!               \n"
-      "vmovl.u16   q2, d0                        \n"
-      "vmovl.u16   q3, d1                        \n"
-      "vmovl.u16   q4, d2                        \n"
-      "vmovl.u16   q5, d3                        \n"
-      "vmul.u32    q2, q2, q6                    \n"
-      "vmul.u32    q3, q3, q6                    \n"
-      "vmul.u32    q4, q4, q6                    \n"
-      "vmul.u32    q5, q5, q6                    \n"
-      "vshrn.u32   d0, q2, #16                   \n"
-      "vshrn.u32   d1, q3, #16                   \n"
-      "vshrn.u32   d2, q4, #16                   \n"
-      "vshrn.u32   d3, q5, #16                   \n"
+      "vld1.16     {q2, q3}, [%0]!               \n"
+      "vmull.u16   q0, d4, d8                    \n"
+      "vmull.u16   q1, d5, d8                    \n"
+      "vmull.u16   q2, d6, d8                    \n"
+      "vmull.u16   q3, d7, d8                    \n"
+      "vshrn.u32   d0, q0, #16                   \n"
+      "vshrn.u32   d1, q1, #16                   \n"
+      "vshrn.u32   d2, q2, #16                   \n"
+      "vshrn.u32   d3, q3, #16                   \n"
       "vst1.16     {q0, q1}, [%1]!               \n"  // store 16 pixels
       "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
       "bgt         1b                            \n"
@@ -3933,7 +3929,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
         "+r"(dst_y),  // %1
         "+r"(width)   // %2
       : "r"(scale)    // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "d8");
 }
 
 // Use scale to convert lsb formats to msb, depending how many bits there are:
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 088174c87..7f04b6068 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4461,22 +4461,18 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "dup         v6.8h, %w3                    \n"
+      "dup         v4.8h, %w3                    \n"
       "1:                                        \n"
-      "ldp         q0, q1, [%0], #32             \n"
-      "ushll       v2.4s, v0.4h, #0              \n"
-      "ushll2      v3.4s, v0.8h, #0              \n"
-      "ushll       v4.4s, v1.4h, #0              \n"
-      "ushll2      v5.4s, v1.8h, #0              \n"
+      "ldp         q2, q3, [%0], #32             \n"
+      "umull       v0.4s, v2.4h, v4.4h           \n"
+      "umull2      v1.4s, v2.8h, v4.8h           \n"
+      "umull       v2.4s, v3.4h, v4.4h           \n"
+      "umull2      v3.4s, v3.8h, v4.8h           \n"
       "prfm        pldl1keep, [%0, 448]          \n"
-      "mul         v2.4s, v2.4s, v6.4s           \n"
-      "mul         v3.4s, v3.4s, v6.4s           \n"
-      "mul         v4.4s, v4.4s, v6.4s           \n"
-      "mul         v5.4s, v5.4s, v6.4s           \n"
-      "shrn        v0.4h, v2.4s, #16             \n"
-      "shrn2       v0.8h, v3.4s, #16             \n"
-      "shrn        v1.4h, v4.4s, #16             \n"
-      "shrn2       v1.8h, v5.4s, #16             \n"
+      "shrn        v0.4h, v0.4s, #16             \n"
+      "shrn2       v0.8h, v1.4s, #16             \n"
+      "shrn        v1.4h, v2.4s, #16             \n"
+      "shrn2       v1.8h, v3.4s, #16             \n"
       "stp         q0, q1, [%1], #32             \n"  // store 16 pixels
       "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
       "b.gt        1b                            \n"
@@ -4484,7 +4480,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
         "+r"(dst_y),  // %1
         "+r"(width)   // %2
       : "r"(scale)    // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
 }
 
 // Use scale to convert lsb formats to msb, depending how many bits there are: