[AArch64] Unroll HalfFloat{,1}Row_NEON

The existing C implementation compiled with a recent LLVM is
auto-vectorised and unrolled to process four vectors per loop iteration,
making the Neon implementation slower than the C implementation on
little cores. To avoid this, unroll the Neon implementation to also
process four vectors per iteration.

Reduction in cycle counts observed compared to the existing Neon
implementation:

            | HalfFloat1Row_NEON | HalfFloatRow_NEON
Cortex-A510 |             -37.1% |            -40.8%
Cortex-A520 |             -32.3% |            -37.4%
Cortex-A720 |               0.0% |            -10.6%
  Cortex-X2 |               0.0% |             -7.8%
  Cortex-X4 |              +0.3% |             -6.9%

Bug: b/42280945
Change-Id: I12b474c970fc4355d75ed924c4ca6169badda2bc
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5872805
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
George Steed 2024-05-07 13:41:47 +01:00 committed by Frank Barchard
parent 51d07554a0
commit f00c43f4d6
3 changed files with 48 additions and 23 deletions

View File

@ -5184,9 +5184,9 @@ int HalfFloatPlane(const uint16_t* src_y,
#if defined(HAS_HALFFLOATROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
HalfFloatRow =
(scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
scale == 1.0f ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
HalfFloatRow = scale == 1.0f ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
}
}
#endif

View File

@ -1810,6 +1810,16 @@ ANY11P16(HalfFloat1Row_Any_F16C,
15)
#endif
#ifdef HAS_HALFFLOATROW_NEON
#ifdef __aarch64__
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 15)
ANY11P16(HalfFloat1Row_Any_NEON,
HalfFloat1Row_NEON,
uint16_t,
uint16_t,
2,
2,
15)
#else
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
ANY11P16(HalfFloat1Row_Any_NEON,
HalfFloat1Row_NEON,
@ -1819,6 +1829,7 @@ ANY11P16(HalfFloat1Row_Any_NEON,
2,
7)
#endif
#endif
#ifdef HAS_HALFFLOATROW_MSA
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
#endif

View File

@ -4669,50 +4669,64 @@ void HalfFloat1Row_NEON(const uint16_t* src,
uint16_t* dst,
float /*unused*/,
int width) {
// Converts a row of uint16_t values from src into half-precision floats in
// dst with no scaling (the float parameter is unnamed and unused): each
// value is widened to 32 bits (uxtl/uxtl2), converted to single precision
// (scvtf) and narrowed to half precision (fcvtn/fcvtn2).
// src and dst are post-incremented by the loads/stores; width is decremented
// by subs and the loop repeats while the result is > 0 (b.gt).
// NOTE(review): this listing appears to hold both an 8-per-iteration and a
// 16-per-iteration variant of the loop body (diff lines without +/- markers);
// the comments below describe each instruction as written.
// NOTE(review): width is presumably a positive multiple of the per-iteration
// count - confirm against the caller.
asm volatile (
asm volatile(
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // low 4 shorts of v1 -> 4 ints
"ldp q0, q1, [%0], #32 \n" // load 16 shorts
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"uxtl v2.4s, v0.4h \n" // low 4 shorts of v0 -> 4 ints
"uxtl v4.4s, v1.4h \n" // low 4 shorts of v1 -> 4 ints
"uxtl2 v3.4s, v0.8h \n" // high 4 shorts of v0 -> 4 ints
"uxtl2 v5.4s, v1.8h \n" // high 4 shorts of v1 -> 4 ints
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes ahead of src
"uxtl2 v3.4s, v1.8h \n" // high 4 shorts of v1 -> 4 ints
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v2.4s, v2.4s \n" // ints in v2 -> floats
"scvtf v4.4s, v4.4s \n" // ints in v4 -> floats
"scvtf v3.4s, v3.4s \n" // ints in v3 -> floats
"fcvtn v1.4h, v2.4s \n" // 8 half floats
"fcvtn2 v1.8h, v3.4s \n" // floats in v3 -> high half of v1
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"scvtf v5.4s, v5.4s \n" // ints in v5 -> floats
"fcvtn v0.4h, v2.4s \n" // floats in v2 -> low half of v0
"fcvtn v1.4h, v4.4s \n" // floats in v4 -> low half of v1
"fcvtn2 v0.8h, v3.4s \n" // floats in v3 -> high half of v0
"fcvtn2 v1.8h, v5.4s \n" // floats in v5 -> high half of v1
"stp q0, q1, [%1], #32 \n" // store 16 shorts
"b.gt 1b \n" // loop while width > 0 (flags from subs)
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3");
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
// Converts a row of uint16_t values from src into 16-bit half-float bit
// patterns in dst, applying scale. Each value is widened to 32 bits
// (uxtl/uxtl2), converted to single precision (scvtf), multiplied by
// scale * 1.9259299444e-34f (fmul), then shifted right by 13 with unsigned
// saturating narrowing (uqshrn/uqshrn2) to produce the 16-bit result.
// NOTE(review): 1.9259299444e-34f is presumably 2^-112, i.e. the difference
// between the float and half-float exponent biases - confirm.
// src and dst are post-incremented by the loads/stores; width is decremented
// by subs and the loop repeats while the result is > 0 (b.gt).
// NOTE(review): this listing appears to hold both an 8-per-iteration and a
// 16-per-iteration variant of the loop body (diff lines without +/- markers);
// the comments below describe each instruction as written.
void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
asm volatile (
asm volatile(
"1: \n"
"ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
"subs %w2, %w2, #8 \n" // 8 pixels per loop
"uxtl v2.4s, v1.4h \n" // low 4 shorts of v1 -> 4 ints
"ldp q0, q1, [%0], #32 \n" // load 16 shorts
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"uxtl v2.4s, v0.4h \n" // low 4 shorts of v0 -> 4 ints
"uxtl v4.4s, v1.4h \n" // low 4 shorts of v1 -> 4 ints
"uxtl2 v3.4s, v0.8h \n" // high 4 shorts of v0 -> 4 ints
"uxtl2 v5.4s, v1.8h \n" // high 4 shorts of v1 -> 4 ints
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes ahead of src
"uxtl2 v3.4s, v1.8h \n" // high 4 shorts of v1 -> 4 ints
"scvtf v2.4s, v2.4s \n" // 8 floats
"scvtf v2.4s, v2.4s \n" // ints in v2 -> floats
"scvtf v4.4s, v4.4s \n" // ints in v4 -> floats
"scvtf v3.4s, v3.4s \n" // ints in v3 -> floats
"scvtf v5.4s, v5.4s \n" // ints in v5 -> floats
"fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
"fmul v4.4s, v4.4s, %3.s[0] \n" // adjust exponent
"fmul v3.4s, v3.4s, %3.s[0] \n" // adjust exponent
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v1.8h, v3.4s, #13 \n" // isolate halffloat (high half of v1)
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"fmul v5.4s, v5.4s, %3.s[0] \n" // adjust exponent
"uqshrn v0.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn v1.4h, v4.4s, #13 \n" // isolate halffloat
"uqshrn2 v0.8h, v3.4s, #13 \n" // isolate halffloat (high half of v0)
"uqshrn2 v1.8h, v5.4s, #13 \n" // isolate halffloat (high half of v1)
"stp q0, q1, [%1], #32 \n" // store 16 shorts
"b.gt 1b \n" // loop while width > 0 (flags from subs)
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3: pre-multiplied scale factor
: "cc", "memory", "v1", "v2", "v3");
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
void ByteToFloatRow_NEON(const uint8_t* src,