From 9f9b5cf660dcfa0d3fdee41cf4ffbe4bb6e95114 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Mon, 28 Apr 2025 11:15:30 -0700
Subject: [PATCH] ARGBToUV allow 32 bit x86 build

- keep the width loop counter in memory (on the stack) on i386
- load the YMM constants in their own asm block
- add a struct holding the shuffle and kAddUV128 constants
- stop clang-format from reflowing the I422ToRGBARow_NEON asm in row_neon.cc

Bug: 413781394
Change-Id: I263f6862cb7589dc31ac65d118f7ebeb65dbb24a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6495259
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
---
 README.chromium          |   2 +-
 include/libyuv/version.h |   2 +-
 source/row_gcc.cc        | 224 +++++++++++++++++++++------------------
 source/row_neon.cc       |  11 +-
 4 files changed, 129 insertions(+), 110 deletions(-)

diff --git a/README.chromium b/README.chromium
index 52aa1fb5b..09cd61c80 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1908
+Version: 1909
 License: BSD-3-Clause
 License File: LICENSE
 Shipped: yes
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e4f992117..7c4feb5fd 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1908
+#define LIBYUV_VERSION 1909
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 8f7980397..6d657d1da 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1642,12 +1642,16 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
 
       "lea         0x40(%0),%0                   \n"
       "lea         0x10(%1),%1                   \n"
-      "sub         $0x10,%3                      \n"
+      "subl        $0x10,%3                      \n"
       "jg          1b                            \n"
-      : "+r"(src_argb),                // %0
-        "+r"(dst_u),                   // %1
-        "+r"(dst_v),                   // %2
-        "+rm"(width)                   // %3
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+#if defined(__i386__)
+        "+m"(width)  // %3
+#else
+        "+rm"(width)  // %3
+#endif
       : "m"(rgbuvconstants->kRGBToU),  // %4
         "m"(rgbuvconstants->kRGBToV),  // %5
         "m"(kAddUV128)                 // %6
@@ -1708,13 +1712,17 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
       "vmovdqu     %%ymm0,(%1,%2,1)              \n"
       "lea         0x80(%0),%0                   \n"
       "lea         0x20(%1),%1                   \n"
-      "sub         $0x20,%3                      \n"
+      "subl        $0x20,%3                      \n"
       "jg          1b                            \n"
       "vzeroupper  \n"
-      : "+r"(src_argb),                // %0
-        "+r"(dst_u),                   // %1
-        "+r"(dst_v),                   // %2
-        "+rm"(width)                   // %3
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+#if defined(__i386__)
+        "+m"(width)  // %3
+#else
+        "+rm"(width)  // %3
+#endif
       : "m"(rgbuvconstants->kRGBToU),  // %4
         "m"(rgbuvconstants->kRGBToV),  // %5
         "m"(kAddUV128),                // %6
@@ -1724,21 +1732,108 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBTOUV444ROW_AVX2
 
-// vpshufb for vphaddw + vpackuswb packed to shorts.
-static const lvec8 kShufARGBToUV_AVX = {
-    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+#ifdef HAS_ARGBTOUVROW_SSSE3
 
-void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
-                            int src_stride_argb,
-                            uint8_t* dst_u,
-                            uint8_t* dst_v,
-                            int width,
-                            const struct RgbUVConstants* rgbuvconstants) {
+void OMITFP ARGBToUVMatrixRow_SSSE3(  // 2 src_argb rows -> subsampled U, V.
+    const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
+    uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
-      "vbroadcastf128 %5,%%ymm5                  \n"
-      "vbroadcastf128 %6,%%ymm6                  \n"
-      "vbroadcastf128 %7,%%ymm7                  \n"
+      "movdqa      %0,%%xmm3                     \n"
+      "movdqa      %1,%%xmm4                     \n"
+      "movdqa      %2,%%xmm5                     \n"
+      :
+      : "m"(rgbuvconstants->kRGBToU),  // %0
+        "m"(rgbuvconstants->kRGBToV),  // %1
+        "m"(kAddUV128)                 // %2
+      : "xmm3", "xmm4", "xmm5");  // consumed by the loop below
+  // Each pass averages 16 pixels x 2 rows (pavgb) 2x2 into 8 U and 8 V bytes.
+  asm volatile("sub         %1,%2                         \n"
+               // %2 becomes dst_v - dst_u so (%1,%2,1) addresses dst_v.
+               LABELALIGN
+               "1:          \n"
+               "movdqu      (%0),%%xmm0                   \n"
+               "movdqu      0x00(%0,%4,1),%%xmm7          \n"
+               "pavgb       %%xmm7,%%xmm0                 \n"
+               "movdqu      0x10(%0),%%xmm1               \n"
+               "movdqu      0x10(%0,%4,1),%%xmm7          \n"
+               "pavgb       %%xmm7,%%xmm1                 \n"
+               "movdqu      0x20(%0),%%xmm2               \n"
+               "movdqu      0x20(%0,%4,1),%%xmm7          \n"
+               "pavgb       %%xmm7,%%xmm2                 \n"
+               "movdqu      0x30(%0),%%xmm6               \n"
+               "movdqu      0x30(%0,%4,1),%%xmm7          \n"
+               "pavgb       %%xmm7,%%xmm6                 \n"
+               "lea         0x40(%0),%0                   \n"
+               "movdqa      %%xmm0,%%xmm7                 \n"
+               "shufps      $0x88,%%xmm1,%%xmm0           \n"
+               "shufps      $0xdd,%%xmm1,%%xmm7           \n"
+               "pavgb       %%xmm7,%%xmm0                 \n"
+               "movdqa      %%xmm2,%%xmm7                 \n"
+               "shufps      $0x88,%%xmm6,%%xmm2           \n"
+               "shufps      $0xdd,%%xmm6,%%xmm7           \n"
+               "pavgb       %%xmm7,%%xmm2                 \n"
+               // U,V = (xmm5 - weighted sum of averaged pixels) >> 8.
+               "movdqa      %%xmm0,%%xmm1                 \n"
+               "movdqa      %%xmm2,%%xmm6                 \n"
+               "pmaddubsw   %%xmm3,%%xmm0                 \n"
+               "pmaddubsw   %%xmm3,%%xmm2                 \n"
+               "pmaddubsw   %%xmm4,%%xmm1                 \n"
+               "pmaddubsw   %%xmm4,%%xmm6                 \n"
+               "phaddw      %%xmm2,%%xmm0                 \n"
+               "phaddw      %%xmm6,%%xmm1                 \n"
+               "movdqa      %%xmm5,%%xmm2                 \n"
+               "movdqa      %%xmm5,%%xmm6                 \n"
+               "psubw       %%xmm0,%%xmm2                 \n"
+               "psubw       %%xmm1,%%xmm6                 \n"
+               "psrlw       $0x8,%%xmm2                   \n"
+               "psrlw       $0x8,%%xmm6                   \n"
+               "packuswb    %%xmm6,%%xmm2                 \n"
+               "movlps      %%xmm2,(%1)                   \n"
+               "movhps      %%xmm2,0x00(%1,%2,1)          \n"
+               "lea         0x8(%1),%1                    \n"
+               "subl        $0x10,%3                      \n"
+               "jg          1b                            \n"
+               : "+r"(src_argb),  // %0
+                 "+r"(dst_u),     // %1
+                 "+r"(dst_v),     // %2
+#if defined(__i386__)
+                 "+m"(width)  // %3: kept in memory on i386
+#else
+                 "+rm"(width)  // %3
+#endif
+               : "r"((intptr_t)(src_stride_argb))  // %4
+               : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+                 "xmm6", "xmm7");
+}
+
+#endif  // HAS_ARGBTOUVROW_SSSE3
+
+// Constants for ARGBToUVMatrixRow_AVX2, addressed through one pointer.
+// Coefficients expressed as negatives to allow 128.
+struct UVMatrixConstants {
+  lvec8 kShufARGBToUV;  // vpshufb for vphaddw + vpackuswb packed to shorts.
+  ulvec8 kAddUV128;     // Read by the asm at byte offset 32 into ymm5.
+};
+
+static const UVMatrixConstants kShufARGBToUV_AVX = {
+    0, 1,   8, 9,   2, 3,   10, 11,  4, 5,   12, 13,  6, 7,   14, 15,
+    0, 1,   8, 9,   2, 3,   10, 11,  4, 5,   12, 13,  6, 7,   14, 15,
+    0, 128, 0, 128, 0, 128, 0,  128, 0, 128, 0,  128, 0, 128, 0,  128,
+    0, 128, 0, 128, 0, 128, 0,  128, 0, 128, 0,  128, 0, 128, 0,  128};
+
+void OMITFP ARGBToUVMatrixRow_AVX2(
+    const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
+    uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
+  asm volatile(
+      "vbroadcastf128 %0,%%ymm6                  \n"
+      "vbroadcastf128 %1,%%ymm7                  \n"
+      :
+      : "m"(rgbuvconstants->kRGBToU),  // %0: broadcast to both ymm6 lanes
+        "m"(rgbuvconstants->kRGBToV)   // %1: broadcast to both ymm7 lanes
+      : "xmm6", "xmm7");  // Written above; named as in the asm block below.
+
+  asm volatile(
+      "vmovdqa     32(%5),%%ymm5                 \n"
       "sub         %1,%2                         \n"
 
       LABELALIGN
@@ -1771,7 +1866,7 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
       "vpsrlw      $0x8,%%ymm0,%%ymm0            \n"
       "vpackuswb   %%ymm0,%%ymm1,%%ymm0          \n"
       "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-      "vpshufb     %8,%%ymm0,%%ymm0              \n"
+      "vpshufb     (%5),%%ymm0,%%ymm0            \n"
 
       "vextractf128 $0x0,%%ymm0,(%1)             \n"
       "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
@@ -1788,90 +1883,11 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
         "+rm"(width)  // %3
 #endif
       : "r"((intptr_t)(src_stride_argb)),  // %4
-        "m"(kAddUV128),                    // %5
-        "m"(rgbuvconstants->kRGBToU),      // %6
-        "m"(rgbuvconstants->kRGBToV),      // %7
-        "m"(kShufARGBToUV_AVX)             // %8
+        "r"(&kShufARGBToUV_AVX)            // %5
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
         "xmm7");
 }
 
-#ifdef HAS_ARGBTOUVROW_SSSE3
-
-void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
-                             int src_stride_argb,
-                             uint8_t* dst_u,
-                             uint8_t* dst_v,
-                             int width,
-                             const struct RgbUVConstants* rgbuvconstants) {
-  asm volatile(
-      "movdqa      %5,%%xmm3                     \n"
-      "movdqa      %6,%%xmm4                     \n"
-      "movdqa      %7,%%xmm5                     \n"
-      "sub         %1,%2                         \n"
-
-      LABELALIGN
-      "1:          \n"
-      "movdqu      (%0),%%xmm0                   \n"
-      "movdqu      0x00(%0,%4,1),%%xmm7          \n"
-      "pavgb       %%xmm7,%%xmm0                 \n"
-      "movdqu      0x10(%0),%%xmm1               \n"
-      "movdqu      0x10(%0,%4,1),%%xmm7          \n"
-      "pavgb       %%xmm7,%%xmm1                 \n"
-      "movdqu      0x20(%0),%%xmm2               \n"
-      "movdqu      0x20(%0,%4,1),%%xmm7          \n"
-      "pavgb       %%xmm7,%%xmm2                 \n"
-      "movdqu      0x30(%0),%%xmm6               \n"
-      "movdqu      0x30(%0,%4,1),%%xmm7          \n"
-      "pavgb       %%xmm7,%%xmm6                 \n"
-      "lea         0x40(%0),%0                   \n"
-      "movdqa      %%xmm0,%%xmm7                 \n"
-      "shufps      $0x88,%%xmm1,%%xmm0           \n"
-      "shufps      $0xdd,%%xmm1,%%xmm7           \n"
-      "pavgb       %%xmm7,%%xmm0                 \n"
-      "movdqa      %%xmm2,%%xmm7                 \n"
-      "shufps      $0x88,%%xmm6,%%xmm2           \n"
-      "shufps      $0xdd,%%xmm6,%%xmm7           \n"
-      "pavgb       %%xmm7,%%xmm2                 \n"
-
-      "movdqa      %%xmm0,%%xmm1                 \n"
-      "movdqa      %%xmm2,%%xmm6                 \n"
-      "pmaddubsw   %%xmm3,%%xmm0                 \n"
-      "pmaddubsw   %%xmm3,%%xmm2                 \n"
-      "pmaddubsw   %%xmm4,%%xmm1                 \n"
-      "pmaddubsw   %%xmm4,%%xmm6                 \n"
-      "phaddw      %%xmm2,%%xmm0                 \n"
-      "phaddw      %%xmm6,%%xmm1                 \n"
-      "movdqa      %%xmm5,%%xmm2                 \n"
-      "movdqa      %%xmm5,%%xmm6                 \n"
-      "psubw       %%xmm0,%%xmm2                 \n"
-      "psubw       %%xmm1,%%xmm6                 \n"
-      "psrlw       $0x8,%%xmm2                   \n"
-      "psrlw       $0x8,%%xmm6                   \n"
-      "packuswb    %%xmm6,%%xmm2                 \n"
-      "movlps      %%xmm2,(%1)                   \n"
-      "movhps      %%xmm2,0x00(%1,%2,1)          \n"
-      "lea         0x8(%1),%1                    \n"
-      "subl        $0x10,%3                      \n"
-      "jg          1b                            \n"
-      : "+r"(src_argb),  // %0
-        "+r"(dst_u),     // %1
-        "+r"(dst_v),     // %2
-#if defined(__i386__)
-        "+m"(width)  // %3
-#else
-        "+rm"(width)  // %3
-#endif
-      : "r"((intptr_t)(src_stride_argb)),  // %4
-        "m"(rgbuvconstants->kRGBToU),      // %5
-        "m"(rgbuvconstants->kRGBToV),      // %6
-        "m"(kAddUV128)                     // %7
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
-}
-
-#endif  // HAS_ARGBTOUVROW_SSSE3
-
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
 
 // RGB to BT601 coefficients
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 9dff94e4e..74cc8a939 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -266,10 +266,13 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
   asm volatile(
       YUVTORGB_SETUP
       "vmov.u8     d6, #255                      \n"
-      "1:          \n"  //
-      READYUV422
-      "subs        %[width], %[width], #8        \n" YUVTORGB RGBTORGB8
-          STORERGBA "bgt         1b                            \n"
+      "1:          \n"                                //
+      READYUV422                                      //
+      "subs        %[width], %[width], #8        \n"  //
+      YUVTORGB                                        //
+          RGBTORGB8                                   //
+      STORERGBA                                       //
+      "bgt         1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
         [src_u] "+r"(src_u),                               // %[src_u]
         [src_v] "+r"(src_v),                               // %[src_v]