diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index ef0a82d45..f065d8d8b 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4701,16 +4701,18 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
-  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+  // 15 - clz(scale), + 8 to shift result into the high half of the lane to
+  // saturate, then we can just use UZP2 to narrow rather than a pair of
+  // saturating narrow instructions.
+  int shift = 23 - __builtin_clz((int32_t)scale);
   asm volatile(
       "dup         v2.8h, %w3                    \n"
       "1:                                        \n"
       "ldp         q0, q1, [%0], #32             \n"
-      "ushl        v0.8h, v0.8h, v2.8h           \n"  // shr = v2 is negative
-      "ushl        v1.8h, v1.8h, v2.8h           \n"
+      "uqshl       v0.8h, v0.8h, v2.8h           \n"
+      "uqshl       v1.8h, v1.8h, v2.8h           \n"
       "prfm        pldl1keep, [%0, 448]          \n"
-      "uqxtn       v0.8b, v0.8h                  \n"
-      "uqxtn2      v0.16b, v1.8h                 \n"
+      "uzp2        v0.16b, v0.16b, v1.16b        \n"
       "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
       "str         q0, [%1], #16                 \n"  // store 16 pixels
       "b.gt        1b                            \n"