From 6c2415bfab36fced80403ac1a89eab96fb289f7d Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Sun, 26 Jan 2025 16:25:46 -0800
Subject: [PATCH] J420ToI420 AVX2

libyuv_test '--gunit_filter=*J420ToI420*' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Skylake Xeon
AVX2 J420ToI420_Opt (114 ms)
C    J420ToI420_Opt (596 ms)

Sapphire Rapids
AVX2 J420ToI420_Opt (126 ms)
C    J420ToI420_Opt (717 ms)

Samsung S23
NEON J420ToI420_Opt (46 ms)
C    J420ToI420_Opt (95 ms)

Bug: 381327032
Change-Id: I2b551507c2a8b1da4f04651b622fc9247a75050d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6201239
Reviewed-by: Justin Green <greenjustin@google.com>
---
 README.chromium            |  2 +-
 include/libyuv/row.h       | 11 +++++++++++
 include/libyuv/version.h   |  2 +-
 source/planar_functions.cc | 21 --------------------
 source/row_any.cc          |  9 +++++++++
 source/row_gcc.cc          | 39 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/README.chromium b/README.chromium
index 18e76cc59..40f197547 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1900
+Version: 1901
 License: BSD
 License File: LICENSE
 Shipped: yes
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7fce069ff..98808b75e 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -301,6 +301,7 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_CONVERT8TO8ROW_AVX2
 #define HAS_DETILEROW_16_AVX
 #define HAS_DIVIDEROW_16_AVX2
 #define HAS_HALFMERGEUVROW_AVX2
@@ -3657,6 +3658,16 @@ void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
                              int scale,
                              int bias,
                              int width);
+void Convert8To8Row_AVX2(const uint8_t* src_y,
+                         uint8_t* dst_y,
+                         int scale,
+                         int bias,
+                         int width);  // 32 pixels per loop (row_gcc.cc).
+void Convert8To8Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int scale,
+                             int bias,
+                             int width);  // ANY11SB wrapper (row_any.cc).
 
 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index adf3e8538..3e514eab9 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1900
+#define LIBYUV_VERSION 1901
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index f0763c41f..ca0bfea90 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -271,19 +271,6 @@ void Convert8To8Plane(const uint8_t* src_y,
     }
   }
 #endif
-#if defined(HAS_CONVERT8TO8ROW_SME)
-  if (TestCpuFlag(kCpuHasSME)) {
-    Convert8To8Row = Convert8To8Row_SME;
-  }
-#endif
-#if defined(HAS_CONVERT8TO8ROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    Convert8To8Row = Convert8To8Row_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
-      Convert8To8Row = Convert8To8Row_SSSE3;
-    }
-  }
-#endif
 #if defined(HAS_CONVERT8TO8ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     Convert8To8Row = Convert8To8Row_Any_AVX2;
@@ -292,14 +279,6 @@ void Convert8To8Plane(const uint8_t* src_y,
     }
   }
 #endif
-#if defined(HAS_CONVERT8TO8ROW_AVX512BW)
-  if (TestCpuFlag(kCpuHasAVX512BW)) {
-    Convert8To8Row = Convert8To8Row_Any_AVX512BW;
-    if (IS_ALIGNED(width, 64)) {
-      Convert8To8Row = Convert8To8Row_AVX512BW;
-    }
-  }
-#endif
 
   // Convert plane
   for (y = 0; y < height; ++y) {
diff --git a/source/row_any.cc b/source/row_any.cc
index 8344aa35f..e994d694e 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1806,6 +1806,15 @@ ANY11SB(Convert8To8Row_Any_NEON,
         uint8_t,
         31)
 #endif
+#ifdef HAS_CONVERT8TO8ROW_AVX2
+ANY11SB(Convert8To8Row_Any_AVX2,
+        Convert8To8Row_AVX2,
+        1,
+        1,
+        uint8_t,
+        uint8_t,
+        31)  // NOTE(review): presumably width mask, 32 per loop - confirm.
+#endif
 #undef ANY11B
 
 // Any 1 to 1 with parameter and shorts to byte.  BPP measures in shorts.
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index c2ad5b8f5..81f35f861 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -9710,6 +9710,45 @@ void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) {
       : "memory", "cc", "xmm0", "xmm1");
 }
 
+#ifdef HAS_CONVERT8TO8ROW_AVX2
+void Convert8To8Row_AVX2(const uint8_t* src_y,
+                         uint8_t* dst_y,
+                         int scale,
+                         int bias,
+                         int width) {
+  asm volatile(
+      "sub         %0,%1                         \n"  // dst -= src
+      "vmovd       %3,%%xmm2                     \n"  // scale
+      "vmovd       %4,%%xmm3                     \n"  // bias
+      "vpbroadcastw %%xmm2,%%ymm2                \n"  // 16 words of scale
+      "vpbroadcastb %%xmm3,%%ymm3                \n"  // 32 bytes of bias
+      "vpxor       %%ymm4,%%ymm4,%%ymm4          \n"  // zero
+      "vpsllw      $8,%%ymm2,%%ymm2              \n"  // scale << 8
+      // dst = ((src * scale) >> 8) + bias. Only the low 8 bits of each used.
+      // 32 pixels per loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu     (%0),%%ymm0                   \n"  // load 32 pixels
+      "vpunpckhbw  %%ymm4,%%ymm0,%%ymm1          \n"  // widen; mutates
+      "vpunpcklbw  %%ymm4,%%ymm0,%%ymm0          \n"
+      "vpmulhuw    %%ymm2,%%ymm0,%%ymm0          \n"  // (src * scale) >> 8
+      "vpmulhuw    %%ymm2,%%ymm1,%%ymm1          \n"
+      "vpackuswb   %%ymm1,%%ymm0,%%ymm0          \n"  // narrow; unmutates
+      "vpaddb      %%ymm3,%%ymm0,%%ymm0          \n"  // + bias (wraps)
+      "vmovdqu     %%ymm0,(%0,%1)                \n"  // store at src + delta
+      "add         $0x20,%0                      \n"
+      "sub         $0x20,%2                      \n"  // width -= 32
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale),   // %3
+        "r"(bias)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
+}
+#endif  // HAS_CONVERT8TO8ROW_AVX2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus