From 889613683a2a064fc04136e8af9135f974611fd8 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 13 Jun 2025 13:07:20 -0700 Subject: [PATCH] Add hybrid detect for Intel laptop cpus - Add +i8mm build option for sve ARGBToUV which uses usdot - util/cpuid Get cpu count (windows, macos, linux) - For each x86 cpu, detect hybrid (e-core) - Includes a comment fix for ubsan unittest - Bump version - Apply clang format to util/*.c as well as all *.cc/*.h Bug: 424637372 Change-Id: I08310e18051fff62c9e4e4a10d1e4361871119ac Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6635640 Reviewed-by: Wan-Teh Chang --- BUILD.gn | 2 +- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/compare_neon64.cc | 3 +- source/row_any.cc | 2 +- source/row_gcc.cc | 24 ++++++++----- source/row_neon.cc | 2 +- unit_test/convert_argb_test.cc | 6 ++-- util/cpuid.c | 61 ++++++++++++++++++++++++++-------- util/yuvconstants.c | 24 ++++++++----- 10 files changed, 88 insertions(+), 40 deletions(-) diff --git a/BUILD.gn b/BUILD.gn index f9e3caea7..aef66ce9a 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -254,7 +254,7 @@ if (libyuv_use_sve) { public_configs = [ ":libyuv_config" ] # SVE2 is an Armv9-A feature. 
- cflags = [ "-march=armv9-a+sve2" ] + cflags = [ "-march=armv9-a+sve2+i8mm" ] } } diff --git a/README.chromium b/README.chromium index f88b071a7..e8d8d566b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1911 +Version: 1912 License: BSD-3-Clause License File: LICENSE Shipped: yes diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d61f39306..8cd8ee6e4 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1911 +#define LIBYUV_VERSION 1912 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 6e56e4274..756f83cb3 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -116,8 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) { uint32_t hash = seed; const uint32_t c16 = 0x92d9e201; // 33^16 uint32_t tmp, tmp2; - asm( - "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" + asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" // count is always a multiple of 16. 
diff --git a/source/row_any.cc b/source/row_any.cc index cb4290faf..a1b1fc13a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2046,7 +2046,7 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ } \ ptrdiff_t np = n; \ - memcpy(vin, src_ptr, r * BPP); \ + memcpy(vin, src_ptr, r* BPP); \ ANY_SIMD(vin, vout, MASK + 1); \ memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index fe4dce883..6fa8261af 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1650,7 +1650,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %3 #else - "+rm"(width) // %3 + "+rm"(width) // %3 #endif : "m"(rgbuvconstants->kRGBToU), // %4 "m"(rgbuvconstants->kRGBToV), // %5 @@ -1721,7 +1721,7 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %3 #else - "+rm"(width) // %3 + "+rm"(width) // %3 #endif : "m"(rgbuvconstants->kRGBToU), // %4 "m"(rgbuvconstants->kRGBToV), // %5 @@ -1734,9 +1734,13 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBTOUVROW_SSSE3 -void OMITFP ARGBToUVMatrixRow_SSSE3( - const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { +void OMITFP +ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { asm volatile( "movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" @@ -1821,9 +1825,13 @@ static const UVMatrixConstants kShufARGBToUV_AVX = { 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128}; -void OMITFP ARGBToUVMatrixRow_AVX2( - const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, - uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { +void OMITFP 
+ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstants* rgbuvconstants) { asm volatile( "vbroadcastf128 %0,%%ymm6 \n" "vbroadcastf128 %1,%%ymm7 \n" diff --git a/source/row_neon.cc b/source/row_neon.cc index 74cc8a939..359cbf40f 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -271,7 +271,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, "subs %[width], %[width], #8 \n" // YUVTORGB // RGBTORGB8 // - STORERGBA // + STORERGBA // "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 0ef4bd954..78a6c079a 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -2731,9 +2731,9 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { // Allocate one extra column so that the coalesce optimizations do not trigger // in convert_argb.cc (they are triggered only when stride is equal to width). 
const size_t kStride = kWidth + 1; - align_buffer_page_end(orig_i400, (size_t) kWidth * kHeight); + align_buffer_page_end(orig_i400, (size_t)kWidth * kHeight); ASSERT_NE(orig_i400, nullptr); - align_buffer_page_end(dest_argb, (size_t) kWidth * kHeight * 4); + align_buffer_page_end(dest_argb, (size_t)kWidth * kHeight * 4); ASSERT_NE(dest_argb, nullptr); for (int i = 0; i < kWidth * kHeight; ++i) { orig_i400[i] = i % 256; @@ -2744,7 +2744,7 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { free_aligned_buffer_page_end(dest_argb); free_aligned_buffer_page_end(orig_i400); } -#endif // defined(_M_X64) || defined(_M_X64) || defined(__aarch64__) +#endif // defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) #endif // !defined(LEAN_TESTS) diff --git a/util/cpuid.c b/util/cpuid.c index de5ff9c96..df1be880c 100644 --- a/util/cpuid.c +++ b/util/cpuid.c @@ -15,6 +15,13 @@ #ifdef __linux__ #include <ctype.h> #include <sys/utsname.h> +#include <unistd.h> // for sysconf +#endif +#if defined(_WIN32) +#include <windows.h> // for GetSystemInfo +#endif +#if defined(__APPLE__) +#include <sys/sysctl.h> // for sysctlbyname #endif #include "libyuv/cpu_id.h" @@ -24,16 +31,16 @@ using namespace libyuv; #endif #ifdef __linux__ -static void KernelVersion(int *version) { +static void KernelVersion(int* version) { struct utsname buffer; int i = 0; version[0] = version[1] = 0; if (uname(&buffer) == 0) { - char *v = buffer.release; + char* v = buffer.release; for (i = 0; *v && i < 2; ++v) { if (isdigit(*v)) { - version[i++] = (int) strtol(v, &v, 10); + version[i++] = (int)strtol(v, &v, 10); } } } @@ -51,6 +58,23 @@ int main(int argc, const char* argv[]) { printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]); } #endif // defined(__linux__) +#if defined(_WIN32) + SYSTEM_INFO sysInfo; + GetSystemInfo(&sysInfo); + int num_cpus = (int)sysInfo.dwNumberOfProcessors; +#elif defined(__linux__) + int num_cpus = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(__APPLE__) + int num_cpus = 0; + size_t num_cpus_len = sizeof(num_cpus); + // Get the 
number of logical CPU cores + if (sysctlbyname("hw.logicalcpu", &num_cpus, &num_cpus_len, NULL, 0) == -1) { + printf("sysctlbyname failed to get hw.logicalcpu\n"); + } +#else + int num_cpus = 0; // unknown OS +#endif + printf("Number of cpus: %d\n", num_cpus); #if defined(__arm__) || defined(__aarch64__) int has_arm = TestCpuFlag(kCpuHasARM); @@ -75,7 +99,8 @@ int main(int argc, const char* argv[]) { // Read and print the SVE and SME vector lengths. if (has_sve) { int sve_vl; - __asm__(".inst 0x04bf5020 \n" // rdvl x0, #1 + __asm__( + ".inst 0x04bf5020 \n" // rdvl x0, #1 "mov %w[sve_vl], w0 \n" : [sve_vl] "=r"(sve_vl) // %[sve_vl] : @@ -84,7 +109,8 @@ int main(int argc, const char* argv[]) { } if (has_sme) { int sme_vl; - __asm__(".inst 0x04bf5820 \n" // rdsvl x0, #1 + __asm__( + ".inst 0x04bf5820 \n" // rdsvl x0, #1 "mov %w[sme_vl], w0 \n" : [sme_vl] "=r"(sme_vl) // %[sme_vl] : @@ -104,8 +130,8 @@ int main(int argc, const char* argv[]) { // Read and print the RVV vector length. if (has_rvv) { - register uint32_t vlenb __asm__ ("t0"); - __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb)); + register uint32_t vlenb __asm__("t0"); + __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb)); printf("RVV vector length: %d bytes\n", vlenb); } } @@ -123,7 +149,7 @@ int main(int argc, const char* argv[]) { #if defined(__loongarch__) int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); if (has_loongarch) { - int has_lsx = TestCpuFlag(kCpuHasLSX); + int has_lsx = TestCpuFlag(kCpuHasLSX); int has_lasx = TestCpuFlag(kCpuHasLASX); printf("Has LOONGARCH 0x%x\n", has_loongarch); printf("Has LSX 0x%x\n", has_lsx); @@ -131,8 +157,8 @@ int main(int argc, const char* argv[]) { } #endif // defined(__loongarch__) -#if defined(__i386__) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_X64) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { int family, model, 
cpu_info[4]; @@ -153,6 +179,13 @@ int main(int argc, const char* argv[]) { cpu_info[3] = 0; printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0])); + for (int n = 0; n < num_cpus; ++n) { + // Check EDX bit 15 for hybrid design indication + CpuId(7, n, &cpu_info[0]); + int hybrid = (cpu_info[3] >> 15) & 1; + printf(" Cpu %d Hybrid %d\n", n, hybrid); + } + // CPU Family and Model // 3:0 - Stepping // 7:4 - Model @@ -163,8 +196,8 @@ int main(int argc, const char* argv[]) { CpuId(1, 0, &cpu_info[0]); family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); - printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, - model, model); + printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, + model); int has_sse2 = TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); @@ -210,7 +243,7 @@ int main(int argc, const char* argv[]) { printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8); printf("Has AMXINT8 0x%x\n", has_amxint8); } -#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || + // defined(_M_X64) return 0; } - diff --git a/util/yuvconstants.c b/util/yuvconstants.c index 4e5185af6..5f52d7553 100644 --- a/util/yuvconstants.c +++ b/util/yuvconstants.c @@ -76,10 +76,14 @@ int main(int argc, const char* argv[]) { // // // U and V contributions to R,G,B. 
- printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64); - printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64); - printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64); - printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64); + printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, + ub * 64); + printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, + ug * 64); + printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, + vg * 64); + printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, + vr * 64); vr = 255.f / 224.f * 2 * (1 - kr); ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg); @@ -97,10 +101,14 @@ int main(int argc, const char* argv[]) { // // // U and V contributions to R,G,B. - printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64); - printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64); - printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64); - printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64); + printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, + ub * 64); + printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, + ug * 64); + printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, + vg * 64); + printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, + vr * 64); return 0; }