HalfFloat fix SigIll on aarch64

- Remove the special case for scale of 1, which used fp16 convert instructions that require a cpuid check
- Port the aarch64 implementation to aarch32
- Fall back to C on aarch32 when a small scale value would make the conversion constant denormal

Bug: 377693555
Change-Id: I38e207e79ac54907ed6e65118b8109288fddb207
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6043392
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2024-11-22 13:58:00 -08:00
parent 307b951229
commit 595146434a
6 changed files with 146 additions and 181 deletions


@@ -6670,14 +6670,6 @@ void HalfFloatRow_SVE2(const uint16_t* src,
uint16_t* dst,
float scale,
int width);
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width);
void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
float param,
int width);
void HalfFloat1Row_SVE2(const uint16_t* src,
uint16_t* dst,
float scale,


@@ -5208,11 +5208,18 @@ int HalfFloatPlane(const uint16_t* src_y,
}
#endif
#if defined(HAS_HALFFLOATROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
HalfFloatRow =
scale == 1.0f ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
if (TestCpuFlag(kCpuHasNEON)
#if defined(__arm__)
// When scale is 1/65535 the scale * 2^-112 used to convert is a denormal.
// But when Neon vmul is asked to multiply a normal float by that
// denormal scale, even though the result would have been normal, it
// flushes to zero. The scalar version of vmul supports denormals.
&& scale >= 1.0f / 4096.0f
#endif
) {
HalfFloatRow = HalfFloatRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
HalfFloatRow = scale == 1.0f ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
HalfFloatRow = HalfFloatRow_NEON;
}
}
#endif

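For reference, here is a minimal standalone C sketch (an illustration, not code from this commit) of the conversion trick at issue: the pixel is scaled by scale * 2^-112, which rebiases the float exponent so the top 16 bits after a 13-bit right shift form an IEEE half. With scale = 1/65535 the combined multiplier is roughly 2^-128, below the smallest normal float (2^-126), which is why a flush-to-zero NEON multiply zeroes it while scalar C code does not:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// 2^-112 rebiases a float exponent (bias 127) down to a half exponent
// (bias 15), so the half's bits can be peeled off the float's bit pattern.
#define HALF_MAGIC 1.9259299444e-34f

// Scalar model of the NEON path: convert to float, scale, then shift away
// the 13 extra mantissa bits. Truncates, matching vqshrn/uqshrn.
static uint16_t ToHalf(uint16_t v, float scale) {
  float f = (float)v * (scale * HALF_MAGIC);
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 13);
}

int main(void) {
  // (1/65535) * 2^-112 is about 2^-128 < FLT_MIN (2^-126): a denormal.
  // NEON vmul flushes it to zero; IEEE-compliant scalar math keeps it.
  printf("multiplier = %g\n", (1.0f / 65535.0f) * HALF_MAGIC);  // ~2.9e-39
  printf("ToHalf(65535) = 0x%04x\n",
         ToHalf(65535, 1.0f / 65535.0f));  // 0x3c00 = 1.0 in half
  return 0;
}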

@@ -1813,25 +1813,7 @@ ANY11P16(HalfFloat1Row_Any_F16C,
15)
#endif
#ifdef HAS_HALFFLOATROW_NEON
#ifdef __aarch64__
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 15)
ANY11P16(HalfFloat1Row_Any_NEON,
HalfFloat1Row_NEON,
uint16_t,
uint16_t,
2,
2,
15)
#else
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
ANY11P16(HalfFloat1Row_Any_NEON,
HalfFloat1Row_NEON,
uint16_t,
uint16_t,
2,
2,
7)
#endif
#endif
#ifdef HAS_HALFFLOATROW_MSA
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)

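For context, the ANY11P16 wrappers above run the SIMD kernel on whole groups of pixels (the trailing 15 or 7 is the group mask) and handle the ragged tail through a padded temporary. A rough sketch of that pattern (a simplified model, not the actual macro; the helper name is invented) assuming the new 16-pixel NEON kernel:

#include <stdint.h>
#include <string.h>

// Simplified model of a libyuv "Any" wrapper: run the SIMD kernel on the
// largest multiple of 16 pixels, then copy the tail into a padded temp so
// the kernel never reads or writes out of bounds.
static void HalfFloatRow_Any(const uint16_t* src, uint16_t* dst, float scale,
                             int width,
                             void (*row)(const uint16_t*, uint16_t*, float,
                                         int)) {
  int n = width & ~15;  // largest multiple of the 16-pixel group
  if (n > 0) {
    row(src, dst, scale, n);
  }
  int r = width & 15;  // remainder pixels
  if (r > 0) {
    uint16_t temp[32];  // 16 pixels in, 16 pixels out
    memset(temp, 0, sizeof(temp));
    memcpy(temp, src + n, r * sizeof(uint16_t));
    row(temp, temp + 16, scale, 16);
    memcpy(dst + n, temp + 16, r * sizeof(uint16_t));
  }
}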

@@ -3536,59 +3536,41 @@ void SobelYRow_NEON(const uint8_t* src_y0,
}
// %y passes a float as a scalar vector for vector * scalar multiply.
// the regoster must be d0 to d15 and indexed with [0] or [1] to access
// the register must be d0 to d15 and indexed with [0] or [1] to access
// the float in the first or second float of the d-reg
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float /*unused*/,
int width) {
asm volatile (
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(1.9259299444e-34f) // %3
: "cc", "memory", "q1", "q2", "q3");
}
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
asm volatile (
asm volatile (
"1: \n"
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
"subs %2, %2, #8 \n" // 8 pixels per loop
"vmovl.u16 q2, d2 \n" // 8 int's
"vmovl.u16 q3, d3 \n"
"vcvt.f32.u32 q2, q2 \n" // 8 floats
"vcvt.f32.u32 q3, q3 \n"
"vmul.f32 q2, q2, %y3 \n" // adjust exponent
"vmul.f32 q3, q3, %y3 \n"
"vqshrn.u32 d2, q2, #13 \n" // isolate halffloat
"vqshrn.u32 d3, q3, #13 \n"
"vst1.8 {q1}, [%1]! \n"
"vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts
"subs %2, %2, #16 \n" // 16 pixels per loop
"vmovl.u16 q8, d0 \n"
"vmovl.u16 q9, d1 \n"
"vmovl.u16 q10, d2 \n"
"vmovl.u16 q11, d3 \n"
"vcvt.f32.u32 q8, q8 \n"
"vcvt.f32.u32 q9, q9 \n"
"vcvt.f32.u32 q10, q10 \n"
"vcvt.f32.u32 q11, q11 \n"
"vmul.f32 q8, q8, %y3 \n" // adjust exponent
"vmul.f32 q9, q9, %y3 \n"
"vmul.f32 q10, q10, %y3 \n"
"vmul.f32 q11, q11, %y3 \n"
"vqshrn.u32 d0, q8, #13 \n" // isolate halffloat
"vqshrn.u32 d1, q9, #13 \n"
"vqshrn.u32 d2, q10, #13 \n"
"vqshrn.u32 d3, q11, #13 \n"
"vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "q1", "q2", "q3");
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}
void ByteToFloatRow_NEON(const uint8_t* src,

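As an aside on the %y operand modifier documented above: with the "w" constraint a float is allocated to an S register, and %y prints it as the enclosing D register plus lane (s1 becomes d0[1]), the form the by-scalar vmul.f32 encoding needs. A minimal sketch (my illustration, AArch32 with NEON only, hypothetical helper name):

// Multiply four floats in place by a scalar, passed the same way scale is
// passed to HalfFloatRow_NEON above.
static void Scale4_NEON(float* v, float s) {
  asm volatile (
      "vld1.32 {q1}, [%0] \n"  // load 4 floats
      "vmul.f32 q1, q1, %y1 \n"  // by-scalar multiply: %y1 -> dN[lane]
      "vst1.32 {q1}, [%0] \n"
      : /* no outputs */
      : "r"(v), "w"(s)
      : "memory", "q1");
}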

@@ -4664,37 +4664,6 @@ void SobelYRow_NEON(const uint8_t* src_y0,
);
}
// Caveat - rounds float to half float whereas scaling version truncates.
void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float /*unused*/,
int width) {
asm volatile(
"1: \n"
"ldp q0, q1, [%0], #32 \n" // load 16 shorts
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"uxtl v2.4s, v0.4h \n"
"uxtl v4.4s, v1.4h \n"
"uxtl2 v3.4s, v0.8h \n"
"uxtl2 v5.4s, v1.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"scvtf v2.4s, v2.4s \n"
"scvtf v4.4s, v4.4s \n"
"scvtf v3.4s, v3.4s \n"
"scvtf v5.4s, v5.4s \n"
"fcvtn v0.4h, v2.4s \n"
"fcvtn v1.4h, v4.4s \n"
"fcvtn2 v0.8h, v3.4s \n"
"fcvtn2 v1.8h, v5.4s \n"
"stp q0, q1, [%1], #32 \n" // store 16 shorts
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
@@ -4717,10 +4686,10 @@ void HalfFloatRow_NEON(const uint16_t* src,
"fmul v3.4s, v3.4s, %3.s[0] \n"
"fmul v5.4s, v5.4s, %3.s[0] \n"
"uqshrn v0.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn v1.4h, v4.4s, #13 \n" // isolate halffloat
"uqshrn v1.4h, v4.4s, #13 \n"
"uqshrn2 v0.8h, v3.4s, #13 \n"
"uqshrn2 v1.8h, v5.4s, #13 \n"
"stp q0, q1, [%1], #32 \n" // store 16 shorts
"stp q0, q1, [%1], #32 \n" // store 16 fp16
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1

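To make the caveat above concrete: fcvtn rounds to nearest-even when narrowing to half precision, while the scaling path's uqshrn #13 simply drops the low 13 mantissa bits, so the two conversions can differ by one ulp. A small standalone sketch (mine, not the library's) of the two behaviors on an exponent-adjusted float:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Truncating conversion, as in the scaling path: drop the 13 low mantissa
// bits of the already exponent-adjusted float.
static uint16_t HalfTruncate(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 13);
}

// Rounding conversion in the style of fcvtn: round to nearest, ties to even.
// Valid only for values already in the normal half range.
static uint16_t HalfRound(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));
  uint32_t lsb = (bits >> 13) & 1;  // last bit that will be kept
  bits += 0x0FFF + lsb;             // round half to even
  return (uint16_t)(bits >> 13);
}

int main(void) {
  // (2 - 2^-12) * 2^-112: the 13 dropped bits are exactly half an ulp, so
  // rounding goes up to 2.0 (0x4000) while truncation stays at 0x3fff.
  float f = 1.99951172f * 1.9259299444e-34f;
  printf("truncate 0x%04x round 0x%04x\n", HalfTruncate(f), HalfRound(f));
  return 0;
}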

@@ -1551,14 +1551,14 @@ TEST_F(LibYUVPlanarTest, TestAffine) {
#endif
}
static int TestCopyPlane(int width,
int height,
static int TestCopyPlane(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
int disable_cpu_flags,
int benchmark_cpu_info,
int invert,
int off) {
int y_plane_size = width * height;
const int y_plane_size = benchmark_width * benchmark_height;
align_buffer_page_end(orig_y, y_plane_size + off);
align_buffer_page_end(dst_c, y_plane_size);
align_buffer_page_end(dst_opt, y_plane_size);
@@ -1570,13 +1570,13 @@ static int TestCopyPlane(int width,
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags);
for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, width, dst_c, width, width, height * invert);
CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, width, dst_opt, width, width, height * invert);
CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert);
}
int max_diff = 0;
@@ -2479,36 +2479,37 @@ static int TestHalfFloatPlane(int benchmark_width,
int disable_cpu_flags,
int benchmark_cpu_info,
float scale,
int mask) {
int mask,
int invert,
int off) {
int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 2;
align_buffer_page_end(orig_y, y_plane_size + off);
align_buffer_page_end(dst_c, y_plane_size);
align_buffer_page_end(dst_opt, y_plane_size);
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8_t* dst_opt = orig_y + y_plane_size;
uint8_t* dst_c = orig_y + y_plane_size * 2;
MemRandomize(orig_y, y_plane_size);
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
MemRandomize(orig_y + off, y_plane_size);
memset(dst_c, 1, y_plane_size);
memset(dst_opt, 2, y_plane_size);
for (i = 0; i < y_plane_size / 2; ++i) {
reinterpret_cast<uint16_t*>(orig_y)[i] &= mask;
reinterpret_cast<uint16_t*>(orig_y + off)[i] &= mask;
}
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y), benchmark_width * 2,
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
reinterpret_cast<uint16_t*>(dst_c), benchmark_width * 2,
scale, benchmark_width, benchmark_height);
scale, benchmark_width, benchmark_height * invert);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y), benchmark_width * 2,
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
reinterpret_cast<uint16_t*>(dst_opt), benchmark_width * 2,
scale, benchmark_width, benchmark_height);
scale, benchmark_width, benchmark_height * invert);
}
int max_diff = 0;
@@ -2525,6 +2526,76 @@ static int TestHalfFloatPlane(int benchmark_width,
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 65535, +1, 0);
EXPECT_LE(diff, 1);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 511.0f, 511, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) {
int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff);
}
#if defined(__arm__)
static void EnableFlushDenormalToZero(void) {
uint32_t cw;
@@ -2535,78 +2606,40 @@ static void EnableFlushDenormalToZero(void) {
: "=r"(cw)
::"memory", "cc"); // Clobber List
}
#endif
static void DisableFlushDenormalToZero(void) {
uint32_t cw;
asm volatile (
"vmrs %0, fpscr \n"
"bic %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)
::"memory", "cc"); // Clobber List
}
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
// exponent to be less than 0. 15 - log2(65536) = -1. This shouldn't normally
// happen since scale is 1/(1<<bits) where bits is 9, 10 or 12.
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
// 32 bit arm rounding on denormal case is off by 1 compared to C.
#if defined(__arm__)
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) {
// 32 bit arm rounding on denormal case is off by 1 compared to C.
EnableFlushDenormalToZero();
#endif
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65536.0f, 65535);
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
DisableFlushDenormalToZero();
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
EnableFlushDenormalToZero();
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 65535);
EXPECT_LE(diff, 1);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 65535);
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
DisableFlushDenormalToZero();
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 1024.0f, 1023);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 512.0f, 511);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Offby1) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4095.0f, 4095);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_One) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 2047);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f, 4095);
EXPECT_LE(diff, 1);
}
#endif // defined(__arm__)
static float TestByteToFloat(int benchmark_width,
int benchmark_height,