From 889613683a2a064fc04136e8af9135f974611fd8 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Fri, 13 Jun 2025 13:07:20 -0700
Subject: [PATCH] Add hybrid detect for Intel laptop cpus

- Add +i8mm build option for the SVE ARGBToUV, which uses usdot
- util/cpuid: get the cpu count (Windows, macOS, Linux)
- util/cpuid: for each x86 cpu, detect hybrid (e-core) via CPUID leaf 7,
  EDX bit 15
- Include a comment fix for the ubsan unittest
- Bump version to 1912
- Apply clang-format to util/*.c as well as all *.cc/*.h

Bug: 424637372
Change-Id: I08310e18051fff62c9e4e4a10d1e4361871119ac
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6635640
Reviewed-by: Wan-Teh Chang <wtc@google.com>
---
 BUILD.gn                       |  2 +-
 README.chromium                |  2 +-
 include/libyuv/version.h       |  2 +-
 source/compare_neon64.cc       |  3 +-
 source/row_any.cc              |  2 +-
 source/row_gcc.cc              | 24 ++++++++-----
 source/row_neon.cc             |  2 +-
 unit_test/convert_argb_test.cc |  6 ++--
 util/cpuid.c                   | 61 ++++++++++++++++++++++++++--------
 util/yuvconstants.c            | 24 ++++++++-----
 10 files changed, 88 insertions(+), 40 deletions(-)

diff --git a/BUILD.gn b/BUILD.gn
index f9e3caea7..aef66ce9a 100644
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -254,7 +254,7 @@ if (libyuv_use_sve) {
     public_configs = [ ":libyuv_config" ]
 
     # SVE2 is an Armv9-A feature.
-    cflags = [ "-march=armv9-a+sve2" ]
+    cflags = [ "-march=armv9-a+sve2+i8mm" ]
   }
 }
 
diff --git a/README.chromium b/README.chromium
index f88b071a7..e8d8d566b 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1911
+Version: 1912
 License: BSD-3-Clause
 License File: LICENSE
 Shipped: yes
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d61f39306..8cd8ee6e4 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1911
+#define LIBYUV_VERSION 1912
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 6e56e4274..756f83cb3 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -116,8 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
   uint32_t hash = seed;
   const uint32_t c16 = 0x92d9e201;  // 33^16
   uint32_t tmp, tmp2;
-      asm(
-      "ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
+      asm("ld1         {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
       "ld1         {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
 
       // count is always a multiple of 16.
diff --git a/source/row_any.cc b/source/row_any.cc
index cb4290faf..a1b1fc13a 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2046,7 +2046,7 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
       ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                        \
     }                                                                 \
     ptrdiff_t np = n;                                                 \
-    memcpy(vin, src_ptr, r * BPP);                                    \
+    memcpy(vin, src_ptr, r* BPP);                                     \
     ANY_SIMD(vin, vout, MASK + 1);                                    \
     memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
   }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index fe4dce883..6fa8261af 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1650,7 +1650,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
 #if defined(__i386__)
         "+m"(width)  // %3
 #else
-        "+rm"(width)  // %3
+        "+rm"(width)           // %3
 #endif
       : "m"(rgbuvconstants->kRGBToU),  // %4
         "m"(rgbuvconstants->kRGBToV),  // %5
@@ -1721,7 +1721,7 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 #if defined(__i386__)
         "+m"(width)  // %3
 #else
-        "+rm"(width)  // %3
+        "+rm"(width)           // %3
 #endif
       : "m"(rgbuvconstants->kRGBToU),  // %4
         "m"(rgbuvconstants->kRGBToV),  // %5
@@ -1734,9 +1734,13 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
 
 #ifdef HAS_ARGBTOUVROW_SSSE3
 
-void OMITFP ARGBToUVMatrixRow_SSSE3(
-    const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
-    uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
+void OMITFP
+ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
+                        int src_stride_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width,
+                        const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
       "movdqa      %0,%%xmm3                     \n"
       "movdqa      %1,%%xmm4                     \n"
@@ -1821,9 +1825,13 @@ static const UVMatrixConstants kShufARGBToUV_AVX = {
     0, 128, 0, 128, 0, 128, 0,  128, 0, 128, 0,  128, 0, 128, 0,  128,
     0, 128, 0, 128, 0, 128, 0,  128, 0, 128, 0,  128, 0, 128, 0,  128};
 
-void OMITFP ARGBToUVMatrixRow_AVX2(
-    const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
-    uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
+void OMITFP
+ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width,
+                       const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
       "vbroadcastf128 %0,%%ymm6                  \n"
       "vbroadcastf128 %1,%%ymm7                  \n"
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 74cc8a939..359cbf40f 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -271,7 +271,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
       "subs        %[width], %[width], #8        \n"  //
       YUVTORGB                                        //
           RGBTORGB8                                   //
-      STORERGBA                                       //
+              STORERGBA                               //
       "bgt         1b                            \n"
       : [src_y] "+r"(src_y),                               // %[src_y]
         [src_u] "+r"(src_u),                               // %[src_u]
diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc
index 0ef4bd954..78a6c079a 100644
--- a/unit_test/convert_argb_test.cc
+++ b/unit_test/convert_argb_test.cc
@@ -2731,9 +2731,9 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
   // Allocate one extra column so that the coalesce optimizations do not trigger
   // in convert_argb.cc (they are triggered only when stride is equal to width).
   const size_t kStride = kWidth + 1;
-  align_buffer_page_end(orig_i400, (size_t) kWidth * kHeight);
+  align_buffer_page_end(orig_i400, (size_t)kWidth * kHeight);
   ASSERT_NE(orig_i400, nullptr);
-  align_buffer_page_end(dest_argb, (size_t) kWidth * kHeight * 4);
+  align_buffer_page_end(dest_argb, (size_t)kWidth * kHeight * 4);
   ASSERT_NE(dest_argb, nullptr);
   for (int i = 0; i < kWidth * kHeight; ++i) {
     orig_i400[i] = i % 256;
@@ -2744,7 +2744,7 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
   free_aligned_buffer_page_end(dest_argb);
   free_aligned_buffer_page_end(orig_i400);
 }
-#endif  // defined(_M_X64) || defined(_M_X64) || defined(__aarch64__)
+#endif  // defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)
 
 #endif  // !defined(LEAN_TESTS)
 
diff --git a/util/cpuid.c b/util/cpuid.c
index de5ff9c96..df1be880c 100644
--- a/util/cpuid.c
+++ b/util/cpuid.c
@@ -15,6 +15,13 @@
 #ifdef __linux__
 #include <ctype.h>
 #include <sys/utsname.h>
+#include <unistd.h>  // for sysconf
+#endif
+#if defined(_WIN32)
+#include <windows.h>  // for GetSystemInfo
+#endif
+#if defined(__APPLE__)
+#include <sys/sysctl.h>  // for sysctlbyname
 #endif
 
 #include "libyuv/cpu_id.h"
@@ -24,16 +31,16 @@ using namespace libyuv;
 #endif
 
 #ifdef __linux__
-static void KernelVersion(int *version) {
+static void KernelVersion(int* version) {
   struct utsname buffer;
   int i = 0;
 
   version[0] = version[1] = 0;
   if (uname(&buffer) == 0) {
-    char *v = buffer.release;
+    char* v = buffer.release;
     for (i = 0; *v && i < 2; ++v) {
       if (isdigit(*v)) {
-        version[i++] = (int) strtol(v, &v, 10);
+        version[i++] = (int)strtol(v, &v, 10);
       }
     }
   }
@@ -51,6 +58,23 @@ int main(int argc, const char* argv[]) {
     printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
   }
 #endif  // defined(__linux__)
+#if defined(_WIN32)
+  SYSTEM_INFO sysInfo;
+  GetSystemInfo(&sysInfo);
+  int num_cpus = (int)sysInfo.dwNumberOfProcessors;
+#elif defined(__linux__)
+  int num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+#elif defined(__APPLE__)
+  int num_cpus = 0;
+  size_t num_cpus_len = sizeof(num_cpus);
+  // Get the number of logical CPU cores
+  if (sysctlbyname("hw.logicalcpu", &num_cpus, &num_cpus_len, NULL, 0) == -1) {
+    printf("sysctlbyname failed to get hw.logicalcpu\n");
+  }
+#else
+  int num_cpus = 0;  // unknown OS
+#endif
+  printf("Number of cpus: %d\n", num_cpus);
 
 #if defined(__arm__) || defined(__aarch64__)
   int has_arm = TestCpuFlag(kCpuHasARM);
@@ -75,7 +99,8 @@ int main(int argc, const char* argv[]) {
     // Read and print the SVE and SME vector lengths.
     if (has_sve) {
       int sve_vl;
-      __asm__(".inst 0x04bf5020    \n"  // rdvl x0, #1
+      __asm__(
+          ".inst 0x04bf5020    \n"  // rdvl x0, #1
           "mov %w[sve_vl], w0  \n"
           : [sve_vl] "=r"(sve_vl)  // %[sve_vl]
           :
@@ -84,7 +109,8 @@ int main(int argc, const char* argv[]) {
     }
     if (has_sme) {
       int sme_vl;
-      __asm__(".inst 0x04bf5820    \n"  // rdsvl x0, #1
+      __asm__(
+          ".inst 0x04bf5820    \n"  // rdsvl x0, #1
           "mov %w[sme_vl], w0  \n"
           : [sme_vl] "=r"(sme_vl)  // %[sme_vl]
           :
@@ -104,8 +130,8 @@ int main(int argc, const char* argv[]) {
 
     // Read and print the RVV vector length.
     if (has_rvv) {
-      register uint32_t vlenb __asm__ ("t0");
-      __asm__(".word 0xC22022F3"  /* CSRR t0, vlenb */ : "=r" (vlenb));
+      register uint32_t vlenb __asm__("t0");
+      __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb));
       printf("RVV vector length: %d bytes\n", vlenb);
     }
   }
@@ -123,7 +149,7 @@ int main(int argc, const char* argv[]) {
 #if defined(__loongarch__)
   int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
   if (has_loongarch) {
-    int has_lsx  = TestCpuFlag(kCpuHasLSX);
+    int has_lsx = TestCpuFlag(kCpuHasLSX);
     int has_lasx = TestCpuFlag(kCpuHasLASX);
     printf("Has LOONGARCH 0x%x\n", has_loongarch);
     printf("Has LSX 0x%x\n", has_lsx);
@@ -131,8 +157,8 @@ int main(int argc, const char* argv[]) {
   }
 #endif  // defined(__loongarch__)
 
-#if defined(__i386__) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_X64)
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+    defined(_M_X64)
   int has_x86 = TestCpuFlag(kCpuHasX86);
   if (has_x86) {
     int family, model, cpu_info[4];
@@ -153,6 +179,13 @@ int main(int argc, const char* argv[]) {
     cpu_info[3] = 0;
     printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
 
+    for (int n = 0; n < num_cpus; ++n) {
+      // Check EDX bit 15 for hybrid design indication
+      CpuId(7, n, &cpu_info[0]);
+      int hybrid = (cpu_info[3] >> 15) & 1;
+      printf("  Cpu %d Hybrid %d\n", n, hybrid);
+    }
+
     // CPU Family and Model
     // 3:0 - Stepping
     // 7:4 - Model
@@ -163,8 +196,8 @@ int main(int argc, const char* argv[]) {
     CpuId(1, 0, &cpu_info[0]);
     family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
     model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
-    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
-           model, model);
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
+           model);
 
     int has_sse2 = TestCpuFlag(kCpuHasSSE2);
     int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
@@ -210,7 +243,7 @@ int main(int argc, const char* argv[]) {
     printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
     printf("Has AMXINT8 0x%x\n", has_amxint8);
   }
-#endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
+        // defined(_M_X64)
   return 0;
 }
-
diff --git a/util/yuvconstants.c b/util/yuvconstants.c
index 4e5185af6..5f52d7553 100644
--- a/util/yuvconstants.c
+++ b/util/yuvconstants.c
@@ -76,10 +76,14 @@ int main(int argc, const char* argv[]) {
   //
   // // U and V contributions to R,G,B.
 
-  printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
-  printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
-  printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
-  printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+  printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub,
+         ub * 64);
+  printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug,
+         ug * 64);
+  printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg,
+         vg * 64);
+  printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr,
+         vr * 64);
 
   vr = 255.f / 224.f * 2 * (1 - kr);
   ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
@@ -97,10 +101,14 @@ int main(int argc, const char* argv[]) {
   //
   // // U and V contributions to R,G,B.
 
-  printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
-  printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
-  printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
-  printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
+  printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub,
+         ub * 64);
+  printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug,
+         ug * 64);
+  printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg,
+         vg * 64);
+  printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr,
+         vr * 64);
 
   return 0;
 }