Add hybrid detection for Intel laptop CPUs

- Add +i8mm build option for SVE ARGBToUV, which uses usdot
- util/cpuid: get the CPU count (Windows, macOS, Linux)
- For each x86 CPU, detect hybrid (E-core)
- Includes a comment fix for the ubsan unittest
- Bump version
- Apply clang-format to util/*.c as well as all *.cc/*.h

Bug: 424637372
Change-Id: I08310e18051fff62c9e4e4a10d1e4361871119ac
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6635640
Reviewed-by: Wan-Teh Chang <wtc@google.com>
This commit is contained in:
Frank Barchard 2025-06-13 13:07:20 -07:00
parent 3d66e94fb5
commit 889613683a
10 changed files with 88 additions and 40 deletions

View File

@ -254,7 +254,7 @@ if (libyuv_use_sve) {
public_configs = [ ":libyuv_config" ]
# SVE2 is an Armv9-A feature.
cflags = [ "-march=armv9-a+sve2" ]
cflags = [ "-march=armv9-a+sve2+i8mm" ]
}
}

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1911
Version: 1912
License: BSD-3-Clause
License File: LICENSE
Shipped: yes

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1911
#define LIBYUV_VERSION 1912
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -116,8 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash = seed;
const uint32_t c16 = 0x92d9e201; // 33^16
uint32_t tmp, tmp2;
asm(
"ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
// count is always a multiple of 16.

View File

@ -2046,7 +2046,7 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2,
ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \
} \
ptrdiff_t np = n; \
memcpy(vin, src_ptr, r * BPP); \
memcpy(vin, src_ptr, r* BPP); \
ANY_SIMD(vin, vout, MASK + 1); \
memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \
}

View File

@ -1650,7 +1650,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3
"+rm"(width) // %3
#endif
: "m"(rgbuvconstants->kRGBToU), // %4
"m"(rgbuvconstants->kRGBToV), // %5
@ -1721,7 +1721,7 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3
"+rm"(width) // %3
#endif
: "m"(rgbuvconstants->kRGBToU), // %4
"m"(rgbuvconstants->kRGBToV), // %5
@ -1734,9 +1734,13 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
#ifdef HAS_ARGBTOUVROW_SSSE3
void OMITFP ARGBToUVMatrixRow_SSSE3(
const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
void OMITFP
ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
@ -1821,9 +1825,13 @@ static const UVMatrixConstants kShufARGBToUV_AVX = {
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128};
void OMITFP ARGBToUVMatrixRow_AVX2(
const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
void OMITFP
ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"vbroadcastf128 %0,%%ymm6 \n"
"vbroadcastf128 %1,%%ymm7 \n"

View File

@ -271,7 +271,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
"subs %[width], %[width], #8 \n" //
YUVTORGB //
RGBTORGB8 //
STORERGBA //
STORERGBA //
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]

View File

@ -2731,9 +2731,9 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
// Allocate one extra column so that the coalesce optimizations do not trigger
// in convert_argb.cc (they are triggered only when stride is equal to width).
const size_t kStride = kWidth + 1;
align_buffer_page_end(orig_i400, (size_t) kWidth * kHeight);
align_buffer_page_end(orig_i400, (size_t)kWidth * kHeight);
ASSERT_NE(orig_i400, nullptr);
align_buffer_page_end(dest_argb, (size_t) kWidth * kHeight * 4);
align_buffer_page_end(dest_argb, (size_t)kWidth * kHeight * 4);
ASSERT_NE(dest_argb, nullptr);
for (int i = 0; i < kWidth * kHeight; ++i) {
orig_i400[i] = i % 256;
@ -2744,7 +2744,7 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
free_aligned_buffer_page_end(dest_argb);
free_aligned_buffer_page_end(orig_i400);
}
#endif // defined(_M_X64) || defined(_M_X64) || defined(__aarch64__)
#endif // defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)
#endif // !defined(LEAN_TESTS)

View File

@ -15,6 +15,13 @@
#ifdef __linux__
#include <ctype.h>
#include <sys/utsname.h>
#include <unistd.h> // for sysconf
#endif
#if defined(_WIN32)
#include <windows.h> // for GetSystemInfo
#endif
#if defined(__APPLE__)
#include <sys/sysctl.h> // for sysctlbyname
#endif
#include "libyuv/cpu_id.h"
@ -24,16 +31,16 @@ using namespace libyuv;
#endif
#ifdef __linux__
static void KernelVersion(int *version) {
static void KernelVersion(int* version) {
struct utsname buffer;
int i = 0;
version[0] = version[1] = 0;
if (uname(&buffer) == 0) {
char *v = buffer.release;
char* v = buffer.release;
for (i = 0; *v && i < 2; ++v) {
if (isdigit(*v)) {
version[i++] = (int) strtol(v, &v, 10);
version[i++] = (int)strtol(v, &v, 10);
}
}
}
@ -51,6 +58,23 @@ int main(int argc, const char* argv[]) {
printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
}
#endif // defined(__linux__)
#if defined(_WIN32)
SYSTEM_INFO sysInfo;
GetSystemInfo(&sysInfo);
int num_cpus = (int)sysInfo.dwNumberOfProcessors;
#elif defined(__linux__)
int num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(__APPLE__)
int num_cpus = 0;
size_t num_cpus_len = sizeof(num_cpus);
// Get the number of logical CPU cores
if (sysctlbyname("hw.logicalcpu", &num_cpus, &num_cpus_len, NULL, 0) == -1) {
printf("sysctlbyname failed to get hw.logicalcpu\n");
}
#else
int num_cpus = 0; // unknown OS
#endif
printf("Number of cpus: %d\n", num_cpus);
#if defined(__arm__) || defined(__aarch64__)
int has_arm = TestCpuFlag(kCpuHasARM);
@ -75,7 +99,8 @@ int main(int argc, const char* argv[]) {
// Read and print the SVE and SME vector lengths.
if (has_sve) {
int sve_vl;
__asm__(".inst 0x04bf5020 \n" // rdvl x0, #1
__asm__(
".inst 0x04bf5020 \n" // rdvl x0, #1
"mov %w[sve_vl], w0 \n"
: [sve_vl] "=r"(sve_vl) // %[sve_vl]
:
@ -84,7 +109,8 @@ int main(int argc, const char* argv[]) {
}
if (has_sme) {
int sme_vl;
__asm__(".inst 0x04bf5820 \n" // rdsvl x0, #1
__asm__(
".inst 0x04bf5820 \n" // rdsvl x0, #1
"mov %w[sme_vl], w0 \n"
: [sme_vl] "=r"(sme_vl) // %[sme_vl]
:
@ -104,8 +130,8 @@ int main(int argc, const char* argv[]) {
// Read and print the RVV vector length.
if (has_rvv) {
register uint32_t vlenb __asm__ ("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
register uint32_t vlenb __asm__("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb));
printf("RVV vector length: %d bytes\n", vlenb);
}
}
@ -123,7 +149,7 @@ int main(int argc, const char* argv[]) {
#if defined(__loongarch__)
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LOONGARCH 0x%x\n", has_loongarch);
printf("Has LSX 0x%x\n", has_lsx);
@ -131,8 +157,8 @@ int main(int argc, const char* argv[]) {
}
#endif // defined(__loongarch__)
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
defined(_M_X64)
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
int family, model, cpu_info[4];
@ -153,6 +179,13 @@ int main(int argc, const char* argv[]) {
cpu_info[3] = 0;
printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
for (int n = 0; n < num_cpus; ++n) {
// Check EDX bit 15 for hybrid design indication
CpuId(7, n, &cpu_info[0]);
int hybrid = (cpu_info[3] >> 15) & 1;
printf(" Cpu %d Hybrid %d\n", n, hybrid);
}
// CPU Family and Model
// 3:0 - Stepping
// 7:4 - Model
@ -163,8 +196,8 @@ int main(int argc, const char* argv[]) {
CpuId(1, 0, &cpu_info[0]);
family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
model, model);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
model);
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
@ -210,7 +243,7 @@ int main(int argc, const char* argv[]) {
printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
printf("Has AMXINT8 0x%x\n", has_amxint8);
}
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
// defined(_M_X64)
return 0;
}

View File

@ -76,10 +76,14 @@ int main(int argc, const char* argv[]) {
//
// // U and V contributions to R,G,B.
printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub,
ub * 64);
printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug,
ug * 64);
printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg,
vg * 64);
printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr,
vr * 64);
vr = 255.f / 224.f * 2 * (1 - kr);
ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
@ -97,10 +101,14 @@ int main(int argc, const char* argv[]) {
//
// // U and V contributions to R,G,B.
printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub, ub * 64);
printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug, ug * 64);
printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg, vg * 64);
printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr, vr * 64);
printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub,
ub * 64);
printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug,
ug * 64);
printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg,
vg * 64);
printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr,
vr * 64);
return 0;
}