From ce32eb773fd66239b794abee1e1ee53fdbc15bc0 Mon Sep 17 00:00:00 2001
From: George Steed <george.steed@arm.com>
Date: Fri, 19 Apr 2024 13:24:52 +0100
Subject: [PATCH] [AArch64] Avoid extraneous CMP in I{444,422}ToARGBRow_SVE2
 impl

We can use subs to set the condition flags as part of the subtract, so
there is no need for a separate compare instruction. No performance
difference was observed from this change, but the implementation now
matches the other SVE2 kernels.

Also remove unnecessary volatile from asm blocks.

Bug: libyuv:973
Change-Id: I9bb4f5f1101086602f7d5223feaeae0fb63b385c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5463951
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
---
 source/row_sve.cc | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/source/row_sve.cc b/source/row_sve.cc
index 4a4383322..a7048b65a 100644
--- a/source/row_sve.cc
+++ b/source/row_sve.cc
@@ -85,29 +85,32 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   uint64_t vl;
-  asm volatile(
-      "cnth     %[vl]                                   \n"
+  asm("cnth     %[vl]                                   \n"
       "ptrue    p0.b                                    \n" YUVTORGB_SVE_SETUP
       "dup      z19.b, #255                             \n" /* A */
-      "cmp      %w[width], %w[vl]                       \n"
-      "b.le     2f                                      \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"
 
       // Run bulk of computation with an all-true predicate to avoid predicate
       // generation overhead.
       "ptrue    p1.h                                    \n"
       "1:                                               \n" READYUV444_SVE
           I4XXTORGB_SVE RGBTORGBA8_SVE
-      "sub      %w[width], %w[width], %w[vl]            \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
       "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
-      "cmp      %w[width], %w[vl]                       \n"
-      "b.gt     1b                                      \n"
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"
+      "b.eq     99f                                     \n"
 
       // Calculate a predicate for the final iteration to deal with the tail.
-      "2:                                               \n"
       "whilelt  p1.h, wzr, %w[width]                    \n" READYUV444_SVE
           I4XXTORGB_SVE RGBTORGBA8_SVE
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
+
+      "99:                                              \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
         [src_u] "+r"(src_u),                               // %[src_u]
         [src_v] "+r"(src_v),                               // %[src_v]
@@ -126,30 +129,32 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   uint64_t vl;
-  asm volatile(
-      "cnth     %[vl]                                   \n"
+  asm("cnth     %[vl]                                   \n"
       "ptrue    p0.b                                    \n" YUVTORGB_SVE_SETUP
       "dup      z19.b, #255                             \n" /* A */
-      "cmp      %w[width], %w[vl]                       \n"
-      "b.le     2f                                      \n"
+      "subs     %w[width], %w[width], %w[vl]            \n"
+      "b.lt     2f                                      \n"
 
       // Run bulk of computation with an all-true predicate to avoid predicate
       // generation overhead.
       "ptrue    p1.h                                    \n"
-      "sub      %w[width], %w[width], %w[vl]            \n"
       "1:                                               \n" READYUV422_SVE
           I4XXTORGB_SVE RGBTORGBA8_SVE
       "subs     %w[width], %w[width], %w[vl]            \n"
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
       "add      %[dst_argb], %[dst_argb], %[vl], lsl #2 \n"
-      "b.gt     1b                                      \n"
-      "add      %w[width], %w[width], %w[vl]            \n"
+      "b.ge     1b                                      \n"
+
+      "2:                                               \n"
+      "adds     %w[width], %w[width], %w[vl]            \n"
+      "b.eq     99f                                     \n"
 
       // Calculate a predicate for the final iteration to deal with the tail.
-      "2:                                               \n"
       "whilelt  p1.h, wzr, %w[width]                    \n" READYUV422_SVE
           I4XXTORGB_SVE RGBTORGBA8_SVE
       "st2h     {z16.h, z17.h}, p1, [%[dst_argb]]       \n"
+
+      "99:                                              \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
         [src_u] "+r"(src_u),                               // %[src_u]
         [src_v] "+r"(src_v),                               // %[src_v]