ARGBToUV: allow 32-bit x86 build

- keep the width loop count on the stack so it does not tie up a register (sketch below)
- set the YMM constants in their own asm block
- add a struct for the shuffle and add constants
- disable clang-format on the row_neon.cc function

Bug: 413781394
Change-Id: I263f6862cb7589dc31ac65d118f7ebeb65dbb24a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6495259
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Frank Barchard 2025-04-28 11:15:30 -07:00 committed by libyuv LUCI CQ
parent 1e40e34573
commit 9f9b5cf660
4 changed files with 129 additions and 110 deletions
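
For context on the width-on-stack bullet: 32-bit x86 has only eight general-purpose registers, and these row functions already need registers for the source/destination pointers, the row stride, and (under PIC) a GOT base, so the diff pins the width counter to a memory operand with "+m" instead of "+rm" on __i386__ and switches sub to subl so the operand size is explicit when no register operand implies it. A minimal self-contained sketch of the same constraint pattern; the function name is hypothetical, not libyuv code:

#include <stdint.h>

// Copy 'width' bytes (width assumed a positive multiple of 16). On 32-bit
// x86 the loop counter is kept in memory ("+m") so it does not occupy one of
// the eight GPRs; elsewhere the compiler may choose register or memory
// ("+rm"). The explicit 'l' suffix on sub is needed because a bare memory
// operand carries no operand size of its own.
static void demo_copy_row(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile("1: \n"
               "movdqu (%0),%%xmm0 \n"
               "movdqu %%xmm0,(%1) \n"
               "lea 0x10(%0),%0 \n"
               "lea 0x10(%1),%1 \n"
               "subl $0x10,%2 \n"
               "jg 1b \n"
               : "+r"(src), "+r"(dst),
#if defined(__i386__)
                 "+m"(width)
#else
                 "+rm"(width)
#endif
               :
               : "memory", "cc", "xmm0");
}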

README.chromium

@@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1908
Version: 1909
License: BSD-3-Clause
License File: LICENSE
Shipped: yes

include/libyuv/version.h

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1908
#define LIBYUV_VERSION 1909
#endif // INCLUDE_LIBYUV_VERSION_H_

source/row_gcc.cc

@@ -1642,12 +1642,16 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
"lea 0x40(%0),%0 \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%3 \n"
"subl $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3
#endif
: "m"(rgbuvconstants->kRGBToU), // %4
"m"(rgbuvconstants->kRGBToV), // %5
"m"(kAddUV128) // %6
@@ -1708,13 +1712,17 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
"vmovdqu %%ymm0,(%1,%2,1) \n"
"lea 0x80(%0),%0 \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%3 \n"
"subl $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3
#endif
: "m"(rgbuvconstants->kRGBToU), // %4
"m"(rgbuvconstants->kRGBToV), // %5
"m"(kAddUV128), // %6
@@ -1724,21 +1732,108 @@
}
#endif // HAS_ARGBTOUV444ROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
void OMITFP ARGBToUVMatrixRow_SSSE3(
const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
: "m"(rgbuvconstants->kRGBToU), // %0
"m"(rgbuvconstants->kRGBToV), // %1
"m"(kAddUV128) // %2
: "xmm3", "xmm4", "xmm5");
asm volatile("sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqu 0x30(%0),%%xmm6 \n"
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"movdqa %%xmm5,%%xmm2 \n"
"movdqa %%xmm5,%%xmm6 \n"
"psubw %%xmm0,%%xmm2 \n"
"psubw %%xmm1,%%xmm6 \n"
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm6 \n"
"packuswb %%xmm6,%%xmm2 \n"
"movlps %%xmm2,(%1) \n"
"movhps %%xmm2,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"subl $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3
#endif
: "r"((intptr_t)(src_stride_argb)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVROW_SSSE3
// vpshufb for vphaddw + vpackuswb packed to shorts.
// Coefficients expressed as negatives to allow 128
struct UVMatrixConstants {
lvec8 kShufARGBToUV;
ulvec8 kAddUV128;
};
static const UVMatrixConstants kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128};
void OMITFP ARGBToUVMatrixRow_AVX2(
const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"vbroadcastf128 %0,%%ymm6 \n"
"vbroadcastf128 %1,%%ymm7 \n"
:
: "m"(rgbuvconstants->kRGBToU), // %0
"m"(rgbuvconstants->kRGBToV) // %1
:);
asm volatile(
"vmovdqa 32(%5),%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
@@ -1771,7 +1866,7 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpshufb %8,%%ymm0,%%ymm0 \n"
"vpshufb (%5),%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
@@ -1788,90 +1883,11 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
"+rm"(width) // %3
#endif
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kAddUV128), // %5
"m"(rgbuvconstants->kRGBToU), // %6
"m"(rgbuvconstants->kRGBToV), // %7
"m"(kShufARGBToUV_AVX) // %8
"r"(&kShufARGBToUV_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"movdqa %5,%%xmm3 \n"
"movdqa %6,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqu 0x30(%0),%%xmm6 \n"
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"lea 0x40(%0),%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
"pavgb %%xmm7,%%xmm0 \n"
"movdqa %%xmm2,%%xmm7 \n"
"shufps $0x88,%%xmm6,%%xmm2 \n"
"shufps $0xdd,%%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
"movdqa %%xmm5,%%xmm2 \n"
"movdqa %%xmm5,%%xmm6 \n"
"psubw %%xmm0,%%xmm2 \n"
"psubw %%xmm1,%%xmm6 \n"
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm6 \n"
"packuswb %%xmm6,%%xmm2 \n"
"movlps %%xmm2,(%1) \n"
"movhps %%xmm2,0x00(%1,%2,1) \n"
"lea 0x8(%1),%1 \n"
"subl $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3
#endif
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(rgbuvconstants->kRGBToU), // %5
"m"(rgbuvconstants->kRGBToV), // %6
"m"(kAddUV128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOUVROW_SSSE3
#ifdef HAS_ARGBTOUV444ROW_SSSE3
// RGB to BT601 coefficients
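
A condensed sketch of the two row_gcc.cc techniques above, assuming (from the diff) that the point is to stay within 32-bit x86's operand and register budget: the RGB-to-UV coefficients are broadcast in their own asm statement so they occupy no operand slots in the main loop, and the shuffle plus bias constants live in one struct so the loop reaches both through a single base register at fixed offsets. All names here (DemoUVConstants, DemoRow_AVX2, coeff16) are illustrative, not libyuv's, and the arithmetic is simplified:

#include <stdint.h>

// Shuffle and bias constants packed into one struct: the loop addresses the
// first member as (%reg) and the second as 32(%reg) from one base register.
struct DemoUVConstants {
  uint8_t shuffle[32];
  uint8_t bias[32];
};

static const struct DemoUVConstants kDemoUV __attribute__((aligned(32))) = {
    {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
     0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15},
    {0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
     0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128}};

// coeff16 points at a 16-byte coefficient row (it stands in for
// rgbuvconstants->kRGBToU). It is broadcast in a separate asm block; ymm6
// stays live into the next block because the two blocks are adjacent, the
// same assumption the commit makes. width is in bytes, a positive multiple
// of 32.
static void DemoRow_AVX2(const uint8_t* src, uint8_t* dst, int width,
                         const uint8_t* coeff16) {
  asm volatile("vbroadcastf128 (%0),%%ymm6 \n"
               :
               : "r"(coeff16)
               : "memory", "xmm6");
  asm volatile("vmovdqa 32(%3),%%ymm5 \n"  // bias, via struct offset
               "1: \n"
               "vmovdqu (%0),%%ymm0 \n"
               "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
               "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
               "vpshufb (%3),%%ymm0,%%ymm0 \n"  // shuffle, same base register
               "vmovdqu %%ymm0,(%1) \n"
               "lea 0x20(%0),%0 \n"
               "lea 0x20(%1),%1 \n"
               "subl $0x20,%2 \n"
               "jg 1b \n"
               "vzeroupper \n"
               : "+r"(src), "+r"(dst),
#if defined(__i386__)
                 "+m"(width)
#else
                 "+rm"(width)
#endif
               : "r"(&kDemoUV)
               : "memory", "cc", "xmm0", "xmm5");
}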

source/row_neon.cc
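
The change below is formatting-only and matches the "disable clang-format" bullet: each macro in the asm string gets a trailing // so clang-format keeps one step per line instead of re-joining the adjacent string literals and macro names. A tiny sketch of the pattern with placeholder macros (not libyuv's):

#include <stdint.h>

// Trailing // comments pin each step to its own line under clang-format.
#define DEMO_LOAD8  "vld1.8 {d0}, [%0]! \n"
#define DEMO_STORE8 "vst1.8 {d0}, [%1]! \n"

// Copies 'width' bytes, 8 at a time (width a positive multiple of 8).
static void demo_neon_loop(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1: \n"                            //
      DEMO_LOAD8                         //
      "subs %2, %2, #8 \n"               //
      DEMO_STORE8                        //
      "bgt 1b \n"
      : "+r"(src), "+r"(dst), "+r"(width)
      :
      : "memory", "cc", "d0");
}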

@@ -266,10 +266,13 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" //
READYUV422
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8
STORERGBA "bgt 1b \n"
"1: \n" //
READYUV422 //
"subs %[width], %[width], #8 \n" //
YUVTORGB //
RGBTORGB8 //
STORERGBA //
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]