J420ToI420 AVX2

libyuv_test '--gunit_filter=*J420ToI420*' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Skylake Xeon
AVX2 J420ToI420_Opt (114 ms)
C    J420ToI420_Opt (596 ms)

Sapphire Rapids
AVX2 J420ToI420_Opt (126 ms)
C    J420ToI420_Opt (717 ms)

Samsung S23
NEON J420ToI420_Opt (46 ms)
C    J420ToI420_Opt (95 ms)

Bug: 381327032
Change-Id: I2b551507c2a8b1da4f04651b622fc9247a75050d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6201239
Reviewed-by: Justin Green <greenjustin@google.com>
2026-01-01 03:12:16 +08:00 · 2025-01-26 16:25:46 -08:00 · 2025-01-26 16:25:46 -08:00 · 6c2415bfab
commit 6c2415bfab
parent 67f3f17d9a
6 changed files with 61 additions and 23 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1900
+Version: 1901
 License: BSD
 License File: LICENSE
 Shipped: yes
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -301,6 +301,7 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_CONVERT8TO8ROW_AVX2
 #define HAS_DETILEROW_16_AVX
 #define HAS_DIVIDEROW_16_AVX2
 #define HAS_HALFMERGEUVROW_AVX2
@ -3657,6 +3658,16 @@ void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
                             int scale,
                             int bias,
                             int width);
+void Convert8To8Row_AVX2(const uint8_t* src_y,  // AVX2 kernel; source bytes
+                         uint8_t* dst_y,        // destination bytes
+                         int scale,  // multiplier applied to each source byte
+                         int bias,   // value added to each scaled byte
+                         int width);  // kernel steps 32 bytes per iteration
+void Convert8To8Row_Any_AVX2(const uint8_t* src_ptr,  // ANY11SB wrapper input
+                             uint8_t* dst_ptr,        // wrapper output
+                             int scale,  // forwarded to Convert8To8Row_AVX2
+                             int bias,   // forwarded to Convert8To8Row_AVX2
+                             int width);  // NOTE(review): presumably any width

 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1900
+#define LIBYUV_VERSION 1901

 #endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -271,19 +271,6 @@ void Convert8To8Plane(const uint8_t* src_y,
    }
  }
 #endif
-#if defined(HAS_CONVERT8TO8ROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    Convert8To8Row = Convert8To8Row_SME;
-  }
-#endif
-#if defined(HAS_CONVERT8TO8ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    Convert8To8Row = Convert8To8Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      Convert8To8Row = Convert8To8Row_SSSE3;
-    }
-  }
-#endif
 #if defined(HAS_CONVERT8TO8ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    Convert8To8Row = Convert8To8Row_Any_AVX2;
@ -292,14 +279,6 @@ void Convert8To8Plane(const uint8_t* src_y,
    }
  }
 #endif
-#if defined(HAS_CONVERT8TO8ROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    Convert8To8Row = Convert8To8Row_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      Convert8To8Row = Convert8To8Row_AVX512BW;
-    }
-  }
-#endif

  // Convert plane
  for (y = 0; y < height; ++y) {
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -1806,6 +1806,15 @@ ANY11SB(Convert8To8Row_Any_NEON,
        uint8_t,
        31)
 #endif
+#ifdef HAS_CONVERT8TO8ROW_AVX2
+ANY11SB(Convert8To8Row_Any_AVX2,  // name of the wrapper being generated
+        Convert8To8Row_AVX2,      // SIMD kernel the wrapper calls
+        1,  // NOTE(review): presumably source bytes per pixel; confirm in macro
+        1,  // NOTE(review): presumably dest bytes per pixel; confirm in macro
+        uint8_t,
+        uint8_t,
+        31)  // kernel steps 32 pixels; presumably the remainder mask, confirm
+#endif
 #undef ANY11B

 // Any 1 to 1 with parameter and shorts to byte.  BPP measures in shorts.
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@ -9710,6 +9710,45 @@ void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
      : "memory", "cc", "xmm0", "xmm1");
 }

+#ifdef HAS_CONVERT8TO8ROW_AVX2  // same macro as row.h / row_any.cc / callers
+void Convert8To8Row_AVX2(const uint8_t* src_y,  // source bytes
+                         uint8_t* dst_y,        // destination bytes
+                         int scale,  // low 16 bits used as the multiplier
+                         int bias,   // low 8 bits added after scaling
+                         int width) {  // consumed 32 bytes per iteration
+  asm volatile(
+      "sub         %0,%1                         \n"  // %1 = dst - src offset
+      "vmovd       %3,%%xmm2                     \n"
+      "vmovd       %4,%%xmm3                     \n"
+      "vpbroadcastw %%xmm2,%%ymm2                \n"  // scale in every word
+      "vpbroadcastb %%xmm3,%%ymm3                \n"  // bias in every byte
+      "vpxor       %%ymm4,%%ymm4,%%ymm4          \n"  // zeros for unpacking
+      "vpsllw      $8,%%ymm2,%%ymm2              \n"  // so mulhi == (v*scale)>>8
+
+      // 32 pixels per loop; body runs at least once and may overrun widths
+      // that are not a multiple of 32.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"
+      "vpunpckhbw  %%ymm4,%%ymm0,%%ymm1          \n"  // mutates (per-lane order)
+      "vpunpcklbw  %%ymm4,%%ymm0,%%ymm0          \n"  // bytes -> 16-bit words
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"  // high 16 bits of product
+      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // unmutates; words -> bytes
+      "vpaddb      %%ymm3,%%ymm0,%%ymm0          \n"  // wrapping add of bias
+      "vmovdqu     %%ymm0,(%0,%1)                \n"  // store to dst via offset
+      "add         $0x20,%0                      \n"
+      "sub         $0x20,%2                      \n"
+      "jg          1b                            \n"  // loop while width > 0
+      "vzeroupper                                \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale),   // %3
+        "r"(bias)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif  // HAS_CONVERT8TO8ROW_AVX2
+
 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus