Fix for divide row functions used by P010ToI010

Bug: libyuv:951
Change-Id: Id323656cb6f99b1be0be7aaa854d3cc15feeba69
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4166562
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
2025-12-06 16:56:55 +08:00 · 2023-01-17 13:06:38 -08:00 · 2023-01-17 13:06:38 -08:00 · 541d8efbaf
commit 541d8efbaf
parent d5aa3d4a76
3 changed files with 36 additions and 36 deletions
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -24,7 +24,10 @@ namespace libyuv {
 extern "C" {
 #endif

-// subsample amount uses a shift.
+// Subsample amount uses a shift.
+//   v is value
+//   a is amount to add to round up
+//   s is shift to subsample down
 #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
 static __inline int Abs(int v) {
  return v >= 0 ? v : -v;
@@ -1291,7 +1294,6 @@ static int PxxxToIxxx(const uint16_t* src_y,
  if (width <= 0 || height == 0) {
    return -1;
  }
-
  ConvertToLSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height,
                       depth);
  SplitUVPlane_16(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3911,31 +3911,29 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                       int scale,
                       int width) {
  asm volatile(
-      "vdup.16     q0, %3                        \n"
+      "vdup.16     q6, %3                        \n"
      "1:                                        \n"
-      "vld1.16     {q1}, [%0]!                   \n"
-      "vld1.16     {q2}, [%0]!                   \n"
-      "vmovl.u16   q3, d2                        \n"
-      "vmovl.u16   q1, d3                        \n"
-      "vmovl.u16   q4, d4                        \n"
-      "vmovl.u16   q2, d5                        \n"
-      "vshl.u32    q3, q3, q0                    \n"
-      "vshl.u32    q4, q4, q0                    \n"
-      "vshl.u32    q1, q1, q0                    \n"
-      "vshl.u32    q2, q2, q0                    \n"
-      "vmovn.u32   d2, q3                        \n"
-      "vmovn.u32   d3, q1                        \n"
-      "vmovn.u32   d4, q4                        \n"
-      "vmovn.u32   d5, q2                        \n"
-      "vst1.16     {q1}, [%1]!                   \n"
-      "vst1.16     {q2}, [%1]!                   \n"
+      "vld1.16     {q0, q1}, [%0]!               \n"
+      "vmovl.u16   q2, d0                        \n"
+      "vmovl.u16   q3, d1                        \n"
+      "vmovl.u16   q4, d2                        \n"
+      "vmovl.u16   q5, d3                        \n"
+      "vmul.u32    q2, q2, q6                    \n"
+      "vmul.u32    q3, q3, q6                    \n"
+      "vmul.u32    q4, q4, q6                    \n"
+      "vmul.u32    q5, q5, q6                    \n"
+      "vshrn.u32   d0, q2, #16                   \n"
+      "vshrn.u32   d1, q3, #16                   \n"
+      "vshrn.u32   d2, q4, #16                   \n"
+      "vshrn.u32   d3, q5, #16                   \n"
+      "vst1.16     {q0, q1}, [%1]!               \n"  // store 16 pixels
      "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
      "bgt         1b                            \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
 }

 // Use scale to convert lsb formats to msb, depending how many bits there are:
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4461,30 +4461,30 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                       int scale,
                       int width) {
  asm volatile(
-      "dup         v0.8h, %w3                    \n"
+      "dup         v6.8h, %w3                    \n"
      "1:                                        \n"
-      "ldp         q1, q2, [%0], #32             \n"
-      "ushll       v3.4s, v1.4h, #0              \n"
-      "ushll       v4.4s, v2.4h, #0              \n"
+      "ldp         q0, q1, [%0], #32             \n"
+      "ushll       v2.4s, v0.4h, #0              \n"
+      "ushll2      v3.4s, v0.8h, #0              \n"
+      "ushll       v4.4s, v1.4h, #0              \n"
+      "ushll2      v5.4s, v1.8h, #0              \n"
      "prfm        pldl1keep, [%0, 448]          \n"
-      "ushll2      v1.4s, v1.8h, #0              \n"
-      "ushll2      v2.4s, v2.8h, #0              \n"
-      "mul         v3.4s, v0.4s, v3.4s           \n"
-      "mul         v4.4s, v0.4s, v4.4s           \n"
-      "mul         v1.4s, v0.4s, v1.4s           \n"
-      "mul         v2.4s, v0.4s, v2.4s           \n"
-      "shrn        v3.4h, v3.4s, #16             \n"
-      "shrn        v4.4h, v4.4s, #16             \n"
-      "shrn2       v3.8h, v1.4s, #16             \n"
-      "shrn2       v4.8h, v2.4s, #16             \n"
-      "stp         q3, q3, [%1], #32             \n"  // store 16 pixels
+      "mul         v2.4s, v2.4s, v6.4s           \n"
+      "mul         v3.4s, v3.4s, v6.4s           \n"
+      "mul         v4.4s, v4.4s, v6.4s           \n"
+      "mul         v5.4s, v5.4s, v6.4s           \n"
+      "shrn        v0.4h, v2.4s, #16             \n"
+      "shrn2       v0.8h, v3.4s, #16             \n"
+      "shrn        v1.4h, v4.4s, #16             \n"
+      "shrn2       v1.8h, v5.4s, #16             \n"
+      "stp         q0, q1, [%1], #32             \n"  // store 16 pixels
      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
      "b.gt        1b                            \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }

 // Use scale to convert lsb formats to msb, depending how many bits there are: