From 5a17753597d77dee881d9d93097ca2c2079e9409 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Mon, 4 May 2026 12:46:28 -0700
Subject: [PATCH] libyuv: Optimize Convert8To8Row_NEON for 32-bit ARM

Benchmark (Convert8To8Plane 1280x720, 1000 repeats):
32-bit: 106 ms -> 44 ms
64-bit: 52 ms (unchanged)

Bug: libyuv:42280902
Change-Id: I389a482f93404984759ef6223d7d191579d3578d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7812450
Reviewed-by: Justin Green
Commit-Queue: Frank Barchard
---
 source/row_neon.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/source/row_neon.cc b/source/row_neon.cc
index d893dd7e3..257398bbe 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -4011,12 +4011,10 @@ void Convert8To8Row_NEON(const uint8_t* src_y,
 "vmull.u8 q1, d5, d8 \n"
 "vmull.u8 q2, d6, d8 \n"
 "vmull.u8 q3, d7, d8 \n"
- "vshrn.u16 d0, q0, #8 \n"
- "vshrn.u16 d1, q1, #8 \n"
- "vshrn.u16 d2, q2, #8 \n"
- "vshrn.u16 d3, q3, #8 \n"
- "vadd.u8 q0, q0, q5 \n"
- "vadd.u8 q1, q1, q5 \n"
+ "vuzp.8 q0, q1 \n"
+ "vuzp.8 q2, q3 \n"
+ "vadd.u8 q0, q1, q5 \n"
+ "vadd.u8 q1, q3, q5 \n"
 "vst1.8 {q0, q1}, [%1]! \n" // store 32 pixels
 "bgt 1b \n"
 : "+r"(src_y), // %0