HalfFloat fix SigIll on aarch64

- Remove the special case for scale of 1, which used fp16 convert instructions that require a cpuid check
- Port the aarch64 implementation to aarch32
- Fall back to C on aarch32 when a small scale value would make the conversion constant denormal

Bug: 377693555
Change-Id: I38e207e79ac54907ed6e65118b8109288fddb207
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6043392
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2024-11-22 13:58:00 -08:00
parent 307b951229
commit 595146434a
6 changed files with 146 additions and 181 deletions


@@ -6670,14 +6670,6 @@ void HalfFloatRow_SVE2(const uint16_t* src,
uint16_t* dst,
float scale,
int width);
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width);
void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
float param,
int width);
void HalfFloat1Row_SVE2(const uint16_t* src,
uint16_t* dst,
float scale,


@@ -5208,11 +5208,18 @@ int HalfFloatPlane(const uint16_t* src_y,
}
#endif
#if defined(HAS_HALFFLOATROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
HalfFloatRow =
scale == 1.0f ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
if (TestCpuFlag(kCpuHasNEON)
#if defined(__arm__)
// When scale is 1/65535 the scale * 2^-112 used to convert is a denormal.
// But when Neon vmul is asked to multiply a normal float by that
// denormal scale, even though the result would have been normal, it
// flushes to zero. The scalar version of vmul supports denormals.
&& scale >= 1.0f / 4096.0f
#endif
) {
HalfFloatRow = HalfFloatRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
HalfFloatRow = scale == 1.0f ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
HalfFloatRow = HalfFloatRow_NEON;
}
}
#endif

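For reference, here is a minimal standalone C sketch (an illustration, not code from this commit) of the conversion trick at issue: the pixel is scaled by scale * 2^-112, which rebiases the float exponent so the top 16 bits after a 13-bit right shift form an IEEE half. With scale = 1/65535 the combined multiplier is roughly 2^-128, below the smallest normal float (2^-126), which is why a flush-to-zero NEON multiply zeroes it while scalar C code does not:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// 2^-112 rebiases a float exponent (bias 127) down to a half exponent
// (bias 15), so the half's bits can be peeled off the float's bit pattern.
#define HALF_MAGIC 1.9259299444e-34f

// Scalar model of the NEON path: convert to float, scale, then shift away
// the 13 extra mantissa bits. Truncates, matching vqshrn/uqshrn.
static uint16_t ToHalf(uint16_t v, float scale) {
  float f = (float)v * (scale * HALF_MAGIC);
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 13);
}

int main(void) {
  // (1/65535) * 2^-112 is about 2^-128 < FLT_MIN (2^-126): a denormal.
  // NEON vmul flushes it to zero; IEEE-compliant scalar math keeps it.
  printf("multiplier = %g\n", (1.0f / 65535.0f) * HALF_MAGIC);  // ~2.9e-39
  printf("ToHalf(65535) = 0x%04x\n",
         ToHalf(65535, 1.0f / 65535.0f));  // 0x3c00 = 1.0 in half
  return 0;
}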

@@ -1813,25 +1813,7 @@ ANY11P16(HalfFloat1Row_Any_F16C,
15)
#endif
#ifdef HAS_HALFFLOATROW_NEON
#ifdef __aarch64__
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 15)
ANY11P16(HalfFloat1Row_Any_NEON,
HalfFloat1Row_NEON,
uint16_t,
uint16_t,
2,
2,
15)
#else
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
ANY11P16(HalfFloat1Row_Any_NEON,
HalfFloat1Row_NEON,
uint16_t,
uint16_t,
2,
2,
7)
#endif
#endif
#ifdef HAS_HALFFLOATROW_MSA
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)

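For context, the ANY11P16 wrappers above run the SIMD kernel on whole groups of pixels (the trailing 15 or 7 is the group mask) and handle the ragged tail through a padded temporary. A rough sketch of that pattern (a simplified model, not the actual macro; the helper name is invented) assuming the new 16-pixel NEON kernel:

#include <stdint.h>
#include <string.h>

// Simplified model of a libyuv "Any" wrapper: run the SIMD kernel on the
// largest multiple of 16 pixels, then copy the tail into a padded temp so
// the kernel never reads or writes out of bounds.
static void HalfFloatRow_Any(const uint16_t* src, uint16_t* dst, float scale,
                             int width,
                             void (*row)(const uint16_t*, uint16_t*, float,
                                         int)) {
  int n = width & ~15;  // largest multiple of the 16-pixel group
  if (n > 0) {
    row(src, dst, scale, n);
  }
  int r = width & 15;  // remainder pixels
  if (r > 0) {
    uint16_t temp[32];  // 16 pixels in, 16 pixels out
    memset(temp, 0, sizeof(temp));
    memcpy(temp, src + n, r * sizeof(uint16_t));
    row(temp, temp + 16, scale, 16);
    memcpy(dst + n, temp + 16, r * sizeof(uint16_t));
  }
}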

@@ -3536,59 +3536,41 @@ void SobelYRow_NEON(const uint8_t* src_y0,
}
// %y passes a float as a scalar vector for vector * scalar multiply.
// the regoster must be d0 to d15 and indexed with [0] or [1] to access
// the register must be d0 to d15 and indexed with [0] or [1] to access
// the float in the first or second float of the d-reg
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float /*unused*/,
int width) {
asm volatile (
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(1.9259299444e-34f) // %3
: "cc", "memory", "q1", "q2", "q3");
}
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
asm volatile (
asm volatile (
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
"vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts
"subs %2, %2, #16 \n" // 16 pixels per loop
"vmovl.u16 q8, d0 \n"
"vmovl.u16 q9, d1 \n"
"vmovl.u16 q10, d2 \n"
"vmovl.u16 q11, d3 \n"
"vcvt.f32.u32 q8, q8 \n"
"vcvt.f32.u32 q9, q9 \n"
"vcvt.f32.u32 q10, q10 \n"
"vcvt.f32.u32 q11, q11 \n"
"vmul.f32 q8, q8, %y3 \n" // adjust exponent
"vmul.f32 q9, q9, %y3 \n"
"vmul.f32 q10, q10, %y3 \n"
"vmul.f32 q11, q11, %y3 \n"
"vqshrn.u32 d0, q8, #13 \n" // isolate halffloat
"vqshrn.u32 d1, q9, #13 \n"
"vqshrn.u32 d2, q10, #13 \n"
"vqshrn.u32 d3, q11, #13 \n"
"vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "q1", "q2", "q3");
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}
void ByteToFloatRow_NEON(const uint8_t* src,

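As an aside on the %y operand modifier documented above: with the "w" constraint a float is allocated to an S register, and %y prints it as the enclosing D register plus lane (s1 becomes d0[1]), the form the by-scalar vmul.f32 encoding needs. A minimal sketch (my illustration, AArch32 with NEON only, hypothetical helper name):

// Multiply four floats in place by a scalar, passed the same way scale is
// passed to HalfFloatRow_NEON above.
static void Scale4_NEON(float* v, float s) {
  asm volatile (
      "vld1.32 {q1}, [%0] \n"  // load 4 floats
      "vmul.f32 q1, q1, %y1 \n"  // by-scalar multiply: %y1 -> dN[lane]
      "vst1.32 {q1}, [%0] \n"
      : /* no outputs */
      : "r"(v), "w"(s)
      : "memory", "q1");
}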

@@ -4664,37 +4664,6 @@ void SobelYRow_NEON(const uint8_t* src_y0,
);
}
// Caveat - rounds float to half float whereas scaling version truncates.
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float /*unused*/,
int width) {
asm volatile(
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 16 shorts
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"uxtl v2.4s, v0.4h \n"
"uxtl v4.4s, v1.4h \n"
"uxtl2 v3.4s, v0.8h \n"
"uxtl2 v5.4s, v1.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"scvtf v2.4s, v2.4s \n"
"scvtf v4.4s, v4.4s \n"
"scvtf v3.4s, v3.4s \n"
"scvtf v5.4s, v5.4s \n"
"fcvtn v0.4h, v2.4s \n"
"fcvtn v1.4h, v4.4s \n"
"fcvtn2 v0.8h, v3.4s \n"
"fcvtn2 v1.8h, v5.4s \n"
"stp q0, q1, [%1], #32 \n" // store 16 shorts
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
@@ -4717,10 +4686,10 @@ void HalfFloatRow_NEON(const uint16_t* src,
"fmul v3.4s, v3.4s, %3.s[0] \n"
"fmul v5.4s, v5.4s, %3.s[0] \n"
"uqshrn v0.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn v1.4h, v4.4s, #13 \n" // isolate halffloat
"uqshrn v1.4h, v4.4s, #13 \n"
"uqshrn2 v0.8h, v3.4s, #13 \n"
"uqshrn2 v1.8h, v5.4s, #13 \n"
"stp q0, q1, [%1], #32 \n" // store 16 shorts
"stp q0, q1, [%1], #32 \n" // store 16 fp16
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1

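To make the caveat above concrete: fcvtn rounds to nearest-even when narrowing to half precision, while the scaling path's uqshrn #13 simply drops the low 13 mantissa bits, so the two conversions can differ by one ulp. A small standalone sketch (mine, not the library's) of the two behaviors on an exponent-adjusted float:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Truncating conversion, as in the scaling path: drop the 13 low mantissa
// bits of the already exponent-adjusted float.
static uint16_t HalfTruncate(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 13);
}

// Rounding conversion in the style of fcvtn: round to nearest, ties to even.
// Valid only for values already in the normal half range.
static uint16_t HalfRound(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  uint32_t lsb = (bits >> 13) & 1;  // last bit that will be kept
  bits += 0x0FFF + lsb;             // round half to even
  return (uint16_t)(bits >> 13);
}

int main(void) {
  // (2 - 2^-12) * 2^-112: the 13 dropped bits are exactly half an ulp, so
  // rounding goes up to 2.0 (0x4000) while truncation stays at 0x3fff.
  float f = 1.99951172f * 1.9259299444e-34f;
  printf("truncate 0x%04x round 0x%04x\n", HalfTruncate(f), HalfRound(f));
  return 0;
}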

@@ -1551,14 +1551,14 @@ TEST_F(LibYUVPlanarTest, TestAffine) {
#endif
}
static int TestCopyPlane(int width,
int height,
static int TestCopyPlane(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info,
int invert,
int off) {
int y_plane_size = width * height;
const int y_plane_size = benchmark_width * benchmark_height;
align_buffer_page_end(orig_y, y_plane_size + off);
align_buffer_page_end(dst_c, y_plane_size);
align_buffer_page_end(dst_opt, y_plane_size);
@@ -1570,13 +1570,13 @@ static int TestCopyPlane(int width,
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags);
for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, width, dst_c, width, width, height * invert);
CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, width, dst_opt, width, width, height * invert);
CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert);
}
int max_diff = 0;
@@ -2479,36 +2479,37 @@ static int TestHalfFloatPlane(int benchmark_width,
int disable_cpu_flags,
int benchmark_cpu_info,
float scale,
int mask) {
int mask,
int invert,
int off) {
int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 2;
align_buffer_page_end(orig_y, y_plane_size + off);
align_buffer_page_end(dst_c, y_plane_size);
align_buffer_page_end(dst_opt, y_plane_size);
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8_t* dst_opt = orig_y + y_plane_size;
uint8_t* dst_c = orig_y + y_plane_size * 2;
MemRandomize(orig_y, y_plane_size);
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
MemRandomize(orig_y + off, y_plane_size);
memset(dst_c, 1, y_plane_size);
memset(dst_opt, 2, y_plane_size);
for (i = 0; i < y_plane_size / 2; ++i) {
reinterpret_cast<uint16_t*>(orig_y)[i] &= mask;
reinterpret_cast<uint16_t*>(orig_y + off)[i] &= mask;
}
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y), benchmark_width * 2,
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
reinterpret_cast<uint16_t*>(dst_c), benchmark_width * 2,
scale, benchmark_width, benchmark_height);
scale, benchmark_width, benchmark_height * invert);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y), benchmark_width * 2,
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
reinterpret_cast<uint16_t*>(dst_opt), benchmark_width * 2,
scale, benchmark_width, benchmark_height);
scale, benchmark_width, benchmark_height * invert);
}
int max_diff = 0;
@@ -2525,6 +2526,76 @@ static int TestHalfFloatPlane(int benchmark_width,
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 65535, +1, 0);
EXPECT_LE(diff, 1);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 511.0f, 511, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) {
int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff);
}
#if defined(__arm__)
static void EnableFlushDenormalToZero(void) {
uint32_t cw;
@@ -2535,78 +2606,40 @@ static void EnableFlushDenormalToZero(void) {
: "=r"(cw)
::"memory", "cc"); // Clobber List
}
#endif
static void DisableFlushDenormalToZero(void) {
uint32_t cw;
asm volatile (
"vmrs %0, fpscr \n"
"bic %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)
::"memory", "cc"); // Clobber List
}
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
// exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally
// happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
// 32 bit arm rounding on denormal case is off by 1 compared to C.
#if defined(__arm__)
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) {
// 32 bit arm rounding on denormal case is off by 1 compared to C.
EnableFlushDenormalToZero();
#endif
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65536.0f, 65535);
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
DisableFlushDenormalToZero();
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
EnableFlushDenormalToZero();
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 65535);
EXPECT_LE(diff, 1);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 65535);
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
DisableFlushDenormalToZero();
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 1024.0f, 1023);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 512.0f, 511);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4095.0f, 4095);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 2047);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 4095);
EXPECT_LE(diff, 1);
}
#endif // defined(__arm__)
static float TestByteToFloat(int benchmark_width,
int benchmark_height,