From 356232b687b98328aa28c64889b429a7649c0db1 Mon Sep 17 00:00:00 2001 From: George Steed Date: Thu, 14 Mar 2024 05:12:21 +0000 Subject: [PATCH] [AArch64] Replace UQXTN{,2} with UZP2 in Convert16To8Row_NEON The existing code makes use of a pair of shifts to put the bits we want in the low part of each vector lane and then a pair of UQXTN and UQXTN2 instructions to perform a saturating cast down from 16-bit elements to 8-bit elements. We can instead achieve the same thing by adding eight to the shift amount and switching the shifts from USHL to UQSHL, so that the bits we want appear in the high half of each lane with the saturation performed by the shift itself, and then simply use UZP2 to pull out the high halves of each lane in a single instruction. Reduction in runtime for Convert16To8Row_NEON: Cortex-A55: -19.7% Cortex-A510: -23.5% Cortex-A76: -35.4% Cortex-X2: -34.1% Bug: libyuv:976 Change-Id: I9a80c0f4f2c6b5203f23e422c0970d3167052f91 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5463950 Reviewed-by: Frank Barchard --- source/row_neon64.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index ef0a82d45..f065d8d8b 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -4701,16 +4701,18 @@ void Convert16To8Row_NEON(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + // 15 - clz(scale), + 8 to shift result into the high half of the lane to + // saturate, then we can just use UZP2 to narrow rather than a pair of + // saturating narrow instructions. 
+ int shift = 23 - __builtin_clz((int32_t)scale); asm volatile( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" - "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative - "ushl v1.8h, v1.8h, v2.8h \n" + "uqshl v0.8h, v0.8h, v2.8h \n" + "uqshl v1.8h, v1.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" - "uqxtn v0.8b, v0.8h \n" - "uqxtn2 v0.16b, v1.8h \n" + "uzp2 v0.16b, v0.16b, v1.16b \n" "subs %w2, %w2, #16 \n" // 16 src pixels per loop "str q0, [%1], #16 \n" // store 16 pixels "b.gt 1b \n"