From 4c3d7d517ae80dbe5e222f7dcb11659f5b240f11 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Wed, 8 Apr 2026 18:48:12 -0700
Subject: [PATCH] ARGBToUV444 for AVX512

ARGBToI444_Opt is 1.27x faster on AMD Zen5 (Turin): 1369 ms -> 1071 ms

Now AVX512
perf record ./libyuv_test '--gunit_filter=*ARGBToI444_Opt' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=10000 --libyuv_flags=-1 --libyuv_cpu_info=-1

[       OK ] LibYUVConvertTest.ARGBToI444_Opt (1071 ms)
Overhead  Symbol
  53.49%  ARGBToYRow_AVX2
  44.70%  ARGBToUV444Row_AVX512BW

Was AVX2
[       OK ] LibYUVConvertTest.ARGBToI444_Opt (1369 ms)
  61.06%  ARGBToUV444Row_AVX2
  37.67%  ARGBToYRow_AVX2

Bug: libyuv:42280902
Change-Id: I306fbac656d6f7834ce1559e86d01eb34931ec3c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7738362
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
---
 README.chromium             |   2 +-
 include/libyuv/row.h        |  34 ++++++++++--
 include/libyuv/version.h    |   2 +-
 source/convert.cc           |  16 ++++++
 source/convert_from_argb.cc |  24 +++++++++
 source/row_any.cc           |   9 ++++
 source/row_gcc.cc           | 102 ++++++++++++++++++++++++++++++++++++
 7 files changed, 182 insertions(+), 7 deletions(-)

diff --git a/README.chromium b/README.chromium
index 592fc1899..698e99b24 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1922
+Version: 1923
 Revision: DEPS
 License: BSD-3-Clause
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index db875b74f..9c11f3199 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -379,6 +379,9 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
     (defined(CLANG_HAS_AVX512))
 #define HAS_I422TOARGBROW_AVX512BW
+#define HAS_ARGBTOUV444ROW_AVX512BW
+#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
+#define HAS_ARGBTOUVJ444ROW_AVX512BW
 #endif
 
 // The following are available on Neon platforms:
@@ -2156,6 +2159,11 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
                                uint8_t* dst_v,
                                int width,
                                const struct ArgbConstants* c);
+void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
+                                   uint8_t* dst_u,
+                                   uint8_t* dst_v,
+                                   int width,
+                                   const struct ArgbConstants* c);
 void ARGBToUVMatrixRow_Any_SSSE3(const uint8_t* src_argb,
                                  int src_stride_argb,
                                  uint8_t* dst_u,
@@ -2178,11 +2186,11 @@ void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb,
                                    uint8_t* dst_v,
                                    int width,
                                    const struct ArgbConstants* c);
-void ARGBToUV444MatrixRow_Any_AVX2(const uint8_t* src_argb,
-                                   uint8_t* dst_u,
-                                   uint8_t* dst_v,
-                                   int width,
-                                   const struct ArgbConstants* c);
+void ARGBToUV444MatrixRow_Any_AVX512BW(const uint8_t* src_argb,
+                                       uint8_t* dst_u,
+                                       uint8_t* dst_v,
+                                       int width,
+                                       const struct ArgbConstants* c);
 
 void ABGRToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
 void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
@@ -2735,19 +2743,35 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
+void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
 void ARGBToUV444Row_Any_AVX2(const uint8_t* src_ptr,
                              uint8_t* dst_u,
                              uint8_t* dst_v,
                              int width);
+void ARGBToUV444Row_Any_AVX512BW(const uint8_t* src_ptr,
+                                 uint8_t* dst_u,
+                                 uint8_t* dst_v,
+                                 int width);
 
 void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width);
+void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
 void ARGBToUVJ444Row_Any_AVX2(const uint8_t* src_ptr,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width);
+void ARGBToUVJ444Row_Any_AVX512BW(const uint8_t* src_ptr,
+                                  uint8_t* dst_u,
+                                  uint8_t* dst_v,
+                                  int width);
 
 void ARGBToUV444Row_C(const uint8_t* src_argb,
                       uint8_t* dst_u,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e8fb3ed5c..c132cdafb 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1922
+#define LIBYUV_VERSION 1923
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index cddaf961b..fbc0ea26e 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -3591,6 +3591,14 @@ int RAWToI444(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUV444Row = ARGBToUV444Row_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUV444ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
@@ -3794,6 +3802,14 @@ int RAWToJ444(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJ444ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUVJ444Row = ARGBToUVJ444Row_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVJ444ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON;
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index c7bf41ea8..d3353ee79 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUV444Row = ARGBToUV444Row_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUV444ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
@@ -200,6 +208,14 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444MATRIXROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYMATRIXROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
@@ -2638,6 +2654,14 @@ int ARGBToJ444(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBTOUVJ444ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    ARGBToUVJ444Row = ARGBToUVJ444Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      ARGBToUVJ444Row = ARGBToUVJ444Row_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOUVJ444ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToUVJ444Row = ARGBToUVJ444Row_Any_NEON;
diff --git a/source/row_any.cc b/source/row_any.cc
index f34f3eb2e..ff8b980a4 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2030,9 +2030,15 @@ ANY12(ARGBToUVJ444Row_Any_SSSE3, ARGBToUVJ444Row_SSSE3, 0, 4, 0, 15)
 #ifdef HAS_ARGBTOUV444ROW_AVX2
 ANY12(ARGBToUV444Row_Any_AVX2, ARGBToUV444Row_AVX2, 0, 4, 0, 31)
 #endif
+#ifdef HAS_ARGBTOUV444ROW_AVX512BW
+ANY12(ARGBToUV444Row_Any_AVX512BW, ARGBToUV444Row_AVX512BW, 0, 4, 0, 63)
+#endif
 #ifdef HAS_ARGBTOUVJ444ROW_AVX2
 ANY12(ARGBToUVJ444Row_Any_AVX2, ARGBToUVJ444Row_AVX2, 0, 4, 0, 31)
 #endif
+#ifdef HAS_ARGBTOUVJ444ROW_AVX512BW
+ANY12(ARGBToUVJ444Row_Any_AVX512BW, ARGBToUVJ444Row_AVX512BW, 0, 4, 0, 63)
+#endif
 #ifdef HAS_YUY2TOUV422ROW_AVX2
 ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
 ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
@@ -2250,6 +2256,9 @@ ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7)
 #ifdef HAS_ARGBTOUV444MATRIXROW_AVX2
 ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31)
 #endif
+#ifdef HAS_ARGBTOUV444MATRIXROW_AVX512BW
+ANY12M(ARGBToUV444MatrixRow_Any_AVX512BW, ARGBToUV444MatrixRow_AVX512BW, 4, 63)
+#endif
 #ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3
 ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15)
 #endif
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 48998d323..dc4957a45 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1723,6 +1723,88 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBTOUV444ROW_AVX2
 
+#ifdef HAS_ARGBTOUV444ROW_AVX512BW
+static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
+                                                    2, 6, 10, 14, 3, 7, 11, 15};

+void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
+                                   uint8_t* dst_u,
+                                   uint8_t* dst_v,
+                                   int width,
+                                   const struct ArgbConstants* c) {
+  asm volatile(  // U and V rows from ARGB; 64 pixels per loop pass.
+      "vbroadcasti64x4 0x20(%4),%%zmm3               \n"  // kRGBToU
+      "vbroadcasti64x4 0x40(%4),%%zmm4               \n"  // kRGBToV
+      "vpternlogd  $0xff,%%zmm16,%%zmm16,%%zmm16 \n"  // -1 in every word
+      "vpsllw      $15,%%zmm16,%%zmm5            \n"  // 0x8000 per word
+      "vmovups     %5,%%zmm7                     \n"  // vpermd indices
+      "sub         %1,%2                         \n"  // %2 = dst_v - dst_u

+      LABELALIGN
+      "1:          \n"
+      "vmovups     (%0),%%zmm0                   \n"  // 4 loads: 256 bytes
+      "vmovups     0x40(%0),%%zmm1               \n"
+      "vmovups     0x80(%0),%%zmm2               \n"
+      "vmovups     0xc0(%0),%%zmm6               \n"
+      "vpmaddubsw  %%zmm3,%%zmm0,%%zmm0          \n"  // x kRGBToU, pair sums
+      "vpmaddubsw  %%zmm3,%%zmm1,%%zmm1          \n"
+      "vpmaddubsw  %%zmm3,%%zmm2,%%zmm2          \n"
+      "vpmaddubsw  %%zmm3,%%zmm6,%%zmm6          \n"
+      "vpmaddwd    %%zmm16,%%zmm0,%%zmm0         \n"  // -(w0 + w1) -> dword
+      "vpmaddwd    %%zmm16,%%zmm1,%%zmm1         \n"
+      "vpmaddwd    %%zmm16,%%zmm2,%%zmm2         \n"
+      "vpmaddwd    %%zmm16,%%zmm6,%%zmm6         \n"
+      "vpackssdw   %%zmm1,%%zmm0,%%zmm0          \n"  // mutates
+      "vpackssdw   %%zmm6,%%zmm2,%%zmm2          \n"
+      "vpsubw      %%zmm5,%%zmm0,%%zmm0          \n"  // minus 0x8000 (zmm5)
+      "vpsubw      %%zmm5,%%zmm2,%%zmm2          \n"
+      "vpsrlw      $0x8,%%zmm0,%%zmm0            \n"  // high byte of word
+      "vpsrlw      $0x8,%%zmm2,%%zmm2            \n"
+      "vpackuswb   %%zmm2,%%zmm0,%%zmm0          \n"  // mutates
+      "vpermd      %%zmm0,%%zmm7,%%zmm0          \n"  // unmutate.
+      "vmovups     %%zmm0,(%1)                   \n"  // store 64 to dst_u

+      "vmovups     (%0),%%zmm0                   \n"  // reload same source
+      "vmovups     0x40(%0),%%zmm1               \n"
+      "vmovups     0x80(%0),%%zmm2               \n"
+      "vmovups     0xc0(%0),%%zmm6               \n"
+      "vpmaddubsw  %%zmm4,%%zmm0,%%zmm0          \n"  // x kRGBToV, pair sums
+      "vpmaddubsw  %%zmm4,%%zmm1,%%zmm1          \n"
+      "vpmaddubsw  %%zmm4,%%zmm2,%%zmm2          \n"
+      "vpmaddubsw  %%zmm4,%%zmm6,%%zmm6          \n"
+      "vpmaddwd    %%zmm16,%%zmm0,%%zmm0         \n"  // -(w0 + w1) -> dword
+      "vpmaddwd    %%zmm16,%%zmm1,%%zmm1         \n"
+      "vpmaddwd    %%zmm16,%%zmm2,%%zmm2         \n"
+      "vpmaddwd    %%zmm16,%%zmm6,%%zmm6         \n"
+      "vpackssdw   %%zmm1,%%zmm0,%%zmm0          \n"  // mutates
+      "vpackssdw   %%zmm6,%%zmm2,%%zmm2          \n"
+      "vpsubw      %%zmm5,%%zmm0,%%zmm0          \n"  // minus 0x8000 (zmm5)
+      "vpsubw      %%zmm5,%%zmm2,%%zmm2          \n"
+      "vpsrlw      $0x8,%%zmm0,%%zmm0            \n"  // high byte of word
+      "vpsrlw      $0x8,%%zmm2,%%zmm2            \n"
+      "vpackuswb   %%zmm2,%%zmm0,%%zmm0          \n"  // mutates
+      "vpermd      %%zmm0,%%zmm7,%%zmm0          \n"  // unmutate.
+      "vmovups     %%zmm0,(%1,%2,1)              \n"  // dst_u + %2 = dst_v
+      "lea         0x100(%0),%0                  \n"  // src_argb += 256
+      "lea         0x40(%1),%1                   \n"  // dst_u += 64
+      "subl        $0x40,%3                      \n"  // width -= 64
+      "jg          1b                            \n"  // loop while > 0
+      "vzeroupper  \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+#if defined(__i386__)
+        "+m"(width)  // %3
+#else
+        "+rm"(width)  // %3
+#endif
+      : "r"(c),                      // %4
+        "m"(kPermdARGBToY_AVX512BW)  // %5
+      : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
+        "zmm7", "zmm16");
+}
+#endif  // HAS_ARGBTOUV444ROW_AVX512BW
+
 #ifdef HAS_ARGBTOUVROW_SSSE3
 
 // ARGBARGB to AARRGGBB shuffle
@@ -1904,6 +1986,16 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBTOUV444ROW_AVX2
 
+#ifdef HAS_ARGBTOUV444ROW_AVX512BW
+void ARGBToUV444Row_AVX512BW(const uint8_t* src_argb,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width) {
+  const struct ArgbConstants* matrix = &kArgbI601Constants;  // preset matrix
+  ARGBToUV444MatrixRow_AVX512BW(src_argb, dst_u, dst_v, width, matrix);
+}
+#endif  // HAS_ARGBTOUV444ROW_AVX512BW
+
 #ifdef HAS_ARGBTOUVROW_SSSE3
 void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                        int src_stride_argb,
@@ -1981,6 +2073,16 @@ void ARGBToUVJ444Row_AVX2(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBTOUVJ444ROW_AVX2
 
+#ifdef HAS_ARGBTOUVJ444ROW_AVX512BW
+void ARGBToUVJ444Row_AVX512BW(const uint8_t* src_argb,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width) {
+  const struct ArgbConstants* matrix = &kArgbJPEGConstants;  // preset matrix
+  ARGBToUV444MatrixRow_AVX512BW(src_argb, dst_u, dst_v, width, matrix);
+}
+#endif  // HAS_ARGBTOUVJ444ROW_AVX512BW
+
 #ifdef HAS_ARGBTOUVJROW_SSSE3
 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                         int src_stride_argb,