mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
- Add kCpuHasAVXVNNI flag - Remove deprecated GFNI detect to make space. Meteor Lake has AVX-VNNI but not AVX512 ~/intelsde/sde -mtl -- blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*CpuHas doyuv3 Note: Google Test filter = *CpuHas [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. [----------] 1 test from LibYUVBaseTest [ RUN ] LibYUVBaseTest.TestCpuHas Cpu Flags 0x203ff1 Has X86 0x10 Has SSE2 0x20 Has SSSE3 0x40 Has SSE41 0x80 Has SSE42 0x100 Has AVX 0x200 Has AVX2 0x400 Has ERMS 0x800 Has FMA3 0x1000 Has F16C 0x2000 Has AVX512BW 0x0 Has AVX512VL 0x0 Has AVX512VNNI 0x0 Has AVX512VBMI 0x0 Has AVX512VBMI2 0x0 Has AVX512VBITALG 0x0 Has AVX512VPOPCNTDQ 0x0 HAS AVXVNNI 0x200000 Has AVXVNNIINT8 0x0 AVX-VNNI detect - Add kCpuHasAVXVNNI flag - Remove deprecated GFNI detect to make space. https://bugs.chromium.org/p/libyuv/issues/detail?id=967 Meteor Lake has AVX-VNNI but not AVX512 ~/intelsde/sde -mtl -- blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*CpuHas doyuv3 Note: Google Test filter = *CpuHas [==========] Running 1 test from 1 test suite. [----------] Global test environment set-up. 
[----------] 1 test from LibYUVBaseTest [ RUN ] LibYUVBaseTest.TestCpuHas Cpu Flags 0x203ff1 Has X86 0x10 Has SSE2 0x20 Has SSSE3 0x40 Has SSE41 0x80 Has SSE42 0x100 Has AVX 0x200 Has AVX2 0x400 Has ERMS 0x800 Has FMA3 0x1000 Has F16C 0x2000 Has AVX512BW 0x0 Has AVX512VL 0x0 Has AVX512VNNI 0x0 Has AVX512VBMI 0x0 Has AVX512VBMI2 0x0 Has AVX512VBITALG 0x0 Has AVX512VPOPCNTDQ 0x0 HAS AVXVNNI 0x200000 Has AVXVNNIINT8 0x0 Running on all cpus the following report avx-vnni grep 'AVXVNNI 0x2' */* adl/libyuv64.txt:HAS AVXVNNI 0x200000 gnr/libyuv64.txt:HAS AVXVNNI 0x200000 grr/libyuv64.txt:HAS AVXVNNI 0x200000 mtl/libyuv64.txt:HAS AVXVNNI 0x200000 rpl/libyuv64.txt:HAS AVXVNNI 0x200000 spr/libyuv64.txt:HAS AVXVNNI 0x200000 srf/libyuv64.txt:HAS AVXVNNI 0x200000 while these support avx512 vnni grep 'VNNI 0x1' */* clx/libyuv64.txt:Has AVX512VNNI 0x10000 cpx/libyuv64.txt:Has AVX512VNNI 0x10000 gnr/libyuv64.txt:Has AVX512VNNI 0x10000 icl/libyuv64.txt:Has AVX512VNNI 0x10000 icx/libyuv64.txt:Has AVX512VNNI 0x10000 spr/libyuv64.txt:Has AVX512VNNI 0x10000 tgl/libyuv64.txt:Has AVX512VNNI 0x10000 and these support avx-vnni-int8 grep AVXVNNIINT8.0x4 */* grr/libyuv64.txt:Has AVXVNNIINT8 0x400000 srf/libyuv64.txt:Has AVXVNNIINT8 0x400000 Bug: libyuv:967 Change-Id: I84cd71d1b320e7c284173eb695fc1d3b72d14ddb Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4912017 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: richard winterton <rrwinterton@gmail.com>
380 lines
12 KiB
C++
380 lines
12 KiB
C++
/*
|
|
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "libyuv/cpu_id.h"
|
|
|
|
#if defined(_MSC_VER)
|
|
#include <intrin.h> // For __cpuidex()
|
|
#endif
|
|
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
|
!defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
|
|
defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
|
|
#include <immintrin.h> // For _xgetbv()
|
|
#endif
|
|
|
|
// Needed by ArmCpuCaps(), which is unit tested on all platforms.
|
|
#include <stdio.h> // For fopen()
|
|
#include <string.h>
|
|
|
|
#ifdef __cplusplus
|
|
namespace libyuv {
|
|
extern "C" {
|
|
#endif
|
|
|
|
// SAFEBUFFERS is placed on functions that keep buffers on the stack. With
// Visual C++ 2010 SP1 or newer (and not clang) it expands to
// __declspec(safebuffers) so the compiler skips the extra runtime overflow
// check for those functions; everywhere else it expands to nothing.
#if !defined(__clang__) && defined(_MSC_FULL_VER) && \
    (_MSC_FULL_VER >= 160040219)
#define SAFEBUFFERS __declspec(safebuffers)
#else
#define SAFEBUFFERS
#endif
|
|
|
|
// cpu_info_ variable for SIMD instruction sets detected.
|
|
// Starts at 0. GetCpuFlags() in this file always sets kCpuInitialized in the
// value it produces, so 0 presumably means "detection has not run yet".
// NOTE(review): the code that reads/writes this variable (e.g. SetCpuFlags)
// is not defined in this file - confirm against libyuv/cpu_id.h.
LIBYUV_API int cpu_info_ = 0;
|
|
|
|
// Low level cpuid for X86.
|
|
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
|
|
defined(__x86_64__)) && \
|
|
!defined(__pnacl__) && !defined(__CLR_VER)
|
|
// Executes the x86 cpuid instruction.
//   info_eax: value loaded into EAX before cpuid (the leaf).
//   info_ecx: value loaded into ECX before cpuid (the subleaf).
//   cpu_info: receives four ints: [0]=EAX, [1]=EBX, [2]=ECX, [3]=EDX.
LIBYUV_API
void CpuId(int info_eax, int info_ecx, int* cpu_info) {
#if defined(_MSC_VER)
// Visual C: use an intrinsic where available, otherwise inline assembly.
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
  __cpuidex(cpu_info, info_eax, info_ecx);
#elif defined(_M_IX86)
  __asm {
    mov        eax, info_eax
    mov        ecx, info_ecx
    mov        edi, cpu_info
    cpuid
    mov        [edi], eax
    mov        [edi + 4], ebx
    mov        [edi + 8], ecx
    mov        [edi + 12], edx
  }
#else  // Visual C but not x86
  // __cpuid() takes no ECX argument, so only info_ecx == 0 can be served;
  // any other request reports all zeros.
  if (info_ecx != 0) {
    cpu_info[0] = 0u;
    cpu_info[1] = 0u;
    cpu_info[2] = 0u;
    cpu_info[3] = 0u;
  } else {
    __cpuid(cpu_info, info_eax);
  }
#endif
#else  // defined(_MSC_VER)
  // GCC-style inline assembly. EAX and ECX are both inputs and outputs.
  int reg_ebx;
  int reg_edx;
  __asm__ volatile(
#if defined(__i386__) && defined(__PIC__)
      // 32-bit PIC: ebx must be preserved, so save it in edi around cpuid
      // and hand the cpuid result back through edi.
      "mov        %%ebx, %%edi                   \n"
      "cpuid                                     \n"
      "xchg       %%edi, %%ebx                   \n"
      : "=D"(reg_ebx),
#else
      "cpuid                                     \n"
      : "=b"(reg_ebx),
#endif  // defined( __i386__) && defined(__PIC__)
        "+a"(info_eax), "+c"(info_ecx), "=d"(reg_edx));
  cpu_info[0] = info_eax;
  cpu_info[1] = reg_ebx;
  cpu_info[2] = info_ecx;
  cpu_info[3] = reg_edx;
#endif  // defined(_MSC_VER)
}
|
|
#else // (defined(_M_IX86) || defined(_M_X64) ...
|
|
// Fallback for targets without a usable cpuid: both request arguments are
// ignored and all four entries of cpu_info are set to zero.
LIBYUV_API
void CpuId(int eax, int ecx, int* cpu_info) {
  int reg;
  (void)eax;
  (void)ecx;
  for (reg = 0; reg < 4; ++reg) {
    cpu_info[reg] = 0;
  }
}
|
|
#endif
|
|
|
|
// For VS2010 and earlier emit can be used:
|
|
// _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
|
|
// __asm {
|
|
// xor ecx, ecx // xcr 0
|
|
// xgetbv
|
|
// mov xcr0, eax
|
|
// }
|
|
// For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code.
|
|
// https://code.google.com/p/libyuv/issues/detail?id=529
|
|
// Before VS2015, 32-bit builds disable the "g" optimization around GetXCR0().
#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
#pragma optimize("g", off)
#endif
#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
     defined(__x86_64__)) &&                                     \
    !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
// Returns the low 32 bits of XCR0, read with xgetbv (ECX = 0). Callers use it
// to test whether the OS saves the extended register state. Returns 0 when
// neither the intrinsic nor GCC-style inline assembly is available.
static int GetXCR0(void) {
  int xcr0_low = 0;
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
  xcr0_low = (int)_xgetbv(0);  // VS2010 SP1 required.  NOLINT
#elif defined(__i386__) || defined(__x86_64__)
  // xgetbv emitted as raw bytes; result arrives in eax, edx is clobbered.
  __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0_low) : "c"(0) : "%edx");
#endif  // defined(__i386__) || defined(__x86_64__)
  return xcr0_low;
}
#else
// No xgetbv on this target, so OS-save support cannot be queried: report 0.
#define GetXCR0() 0
#endif  // defined(_M_IX86) || defined(_M_X64) ..
// Restore the optimization setting changed above.
#if defined(_M_IX86) && defined(_MSC_VER) && (_MSC_VER < 1900)
#pragma optimize("g", on)
#endif
|
|
|
|
// Based on libvpx arm_cpudetect.c
|
|
// For Arm, but public to allow testing on any CPU
|
|
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
|
|
char cpuinfo_line[512];
|
|
FILE* f = fopen(cpuinfo_name, "re");
|
|
if (!f) {
|
|
// Assume Neon if /proc/cpuinfo is unavailable.
|
|
// This will occur for Chrome sandbox for Pepper or Render process.
|
|
return kCpuHasNEON;
|
|
}
|
|
memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
|
|
while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
|
|
if (memcmp(cpuinfo_line, "Features", 8) == 0) {
|
|
char* p = strstr(cpuinfo_line, " neon");
|
|
if (p && (p[5] == ' ' || p[5] == '\n')) {
|
|
fclose(f);
|
|
return kCpuHasNEON;
|
|
}
|
|
// aarch64 uses asimd for Neon.
|
|
p = strstr(cpuinfo_line, " asimd");
|
|
if (p) {
|
|
fclose(f);
|
|
return kCpuHasNEON;
|
|
}
|
|
}
|
|
}
|
|
fclose(f);
|
|
return 0;
|
|
}
|
|
|
|
// RISC-V feature detection from a /proc/cpuinfo style text file.
//   cpuinfo_name: path of the file to parse.
// Returns a combination of kCpuHasRVV and kCpuHasRVVZVFH:
//   - kCpuHasRVV when an "isa" line holds an rv64 ISA string whose
//     single-letter extension part contains 'v';
//   - kCpuHasRVVZVFH when its multi-letter extension list contains "zvfh".
// If the file cannot be opened, returns kCpuHasRVV when compiled with
// __riscv_vector and 0 otherwise.
LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) {
  char cpuinfo_line[512];
  int flag = 0;
  FILE* f = fopen(cpuinfo_name, "re");
  if (!f) {
#if defined(__riscv_vector)
    // Assume RVV if /proc/cpuinfo is unavailable.
    // This will occur for Chrome sandbox for Pepper or Render process.
    return kCpuHasRVV;
#else
    return 0;
#endif
  }
  memset(cpuinfo_line, 0, sizeof(cpuinfo_line));
  while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f)) {
    if (memcmp(cpuinfo_line, "isa", 3) == 0) {
      // ISA string must begin with rv64{i,e,g} for a 64-bit processor.
      char* isa = strstr(cpuinfo_line, "rv64");
      if (isa) {
        size_t isa_len = strlen(isa);
        const char* extensions;
        size_t std_isa_len;
        // Remove the new-line character at the end of string
        if (isa[isa_len - 1] == '\n') {
          isa[--isa_len] = '\0';
        }
        // 5 ISA characters
        if (isa_len < 5) {
          fclose(f);
          return 0;
        }
        // Skip {i,e,g} canonical checking.
        // Skip rvxxx
        isa += 5;
        // Find the very first occurrence of 's', 'x' or 'z'.
        // To detect multi-letter standard, non-standard, and
        // supervisor-level extensions.
        extensions = strpbrk(isa, "zxs");
        // The single-letter extensions are exactly the characters before the
        // first multi-letter extension (or the whole remainder if none).
        // This length must not be derived from strlen() of a tokenized
        // string: that measures only the first token and lets the 'v' search
        // below run into names such as "zve32x".
        std_isa_len = extensions ? (size_t)(extensions - isa) : isa_len - 5;
        if (extensions) {
          // Multi-letter extensions are separated by a single underscore
          // as described in RISC-V User-Level ISA V2.2.
          // Tokens are compared in place; strtok() is avoided because it
          // keeps hidden static state and rewrites the buffer.
          const char* ext = extensions;
          while (ext) {
            const char* sep = strchr(ext, '_');
            size_t ext_len = sep ? (size_t)(sep - ext) : strlen(ext);
            // Search for the ZVFH (Vector FP16) extension.
            if (ext_len == 4 && memcmp(ext, "zvfh", 4) == 0) {
              flag |= kCpuHasRVVZVFH;
            }
            ext = sep ? sep + 1 : NULL;
          }
        }
        // Detect the v in the standard single-letter extensions.
        if (memchr(isa, 'v', std_isa_len)) {
          // The RVV implied the F extension.
          flag |= kCpuHasRVV;
        }
      }
    }
#if defined(__riscv_vector)
    // Assume RVV if /proc/cpuinfo is from x86 host running QEMU.
    else if ((memcmp(cpuinfo_line, "vendor_id\t: GenuineIntel", 24) == 0) ||
             (memcmp(cpuinfo_line, "vendor_id\t: AuthenticAMD", 24) == 0)) {
      fclose(f);
      return kCpuHasRVV;
    }
#endif
  }
  fclose(f);
  return flag;
}
|
|
|
|
// MIPS feature detection from a /proc/cpuinfo style text file.
//   cpuinfo_name: path of the file to parse.
// Returns kCpuHasMSA when the "ASEs implemented" line mentions "msa" or the
// "cpu model" line names a Loongson-2K; otherwise 0. Also returns 0 when the
// file cannot be opened (e.g. inside the Chrome sandbox for a Pepper or
// Render process).
LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
  char line[512];
  int caps = 0;
  FILE* fp = fopen(cpuinfo_name, "re");
  if (fp == NULL) {
    // Nothing can be assumed without the file.
    return 0;
  }
  memset(line, 0, sizeof(line));
  while (fgets(line, sizeof(line), fp) != NULL) {
    // Workaround early kernel without MSA in ASEs line.
    if (memcmp(line, "cpu model", 9) == 0 &&
        strstr(line, "Loongson-2K") != NULL) {
      caps |= kCpuHasMSA;
    }
    if (memcmp(line, "ASEs implemented", 16) != 0) {
      continue;
    }
    if (strstr(line, "msa") != NULL) {
      caps |= kCpuHasMSA;
    }
    // ASEs is the last line of interest, so stop reading here.
    break;
  }
  fclose(fp);
  return caps;
}
|
|
|
|
// Operand passed to the cpucfg instruction by LoongarchCpuCaps().
#define LOONGARCH_CFG2 0x2
// Masks tested in the word cpucfg returns: bit 6 -> LSX, bit 7 -> LASX.
#define LOONGARCH_CFG2_LSX 0x40
#define LOONGARCH_CFG2_LASX 0x80
|
|
|
|
#if defined(__loongarch__)
// Executes cpucfg with operand LOONGARCH_CFG2 and translates the LSX / LASX
// bits of the result into kCpuHasLSX / kCpuHasLASX.
LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) {
  uint32_t cfg_word = 0;
  int caps = 0;

  __asm__ volatile("cpucfg %0, %1 \n\t"
                   : "+&r"(cfg_word)
                   : "r"(LOONGARCH_CFG2));

  if (cfg_word & LOONGARCH_CFG2_LSX) {
    caps |= kCpuHasLSX;
  }
  if (cfg_word & LOONGARCH_CFG2_LASX) {
    caps |= kCpuHasLASX;
  }
  return caps;
}
#endif
|
|
|
|
// Detects the features of the CPU this code is running on and returns them
// as a bit mask of kCpuHas* values. kCpuInitialized is always set in the
// result. Each architecture section below is compiled in only for its target.
static SAFEBUFFERS int GetCpuFlags(void) {
  int flags = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) &&                   \
    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
     defined(_M_IX86))
  // Each array holds one cpuid result: [0]=EAX, [1]=EBX, [2]=ECX, [3]=EDX.
  int leaf0[4] = {0, 0, 0, 0};
  int leaf1[4] = {0, 0, 0, 0};
  int leaf7[4] = {0, 0, 0, 0};
  int leaf7_sub1[4] = {0, 0, 0, 0};

  CpuId(0, 0, leaf0);
  CpuId(1, 0, leaf1);
  // Leaf 7 is queried only when leaf 0 reports it as available; otherwise
  // its arrays stay zero and none of the leaf 7 features are reported.
  if (leaf0[0] >= 7) {
    CpuId(7, 0, leaf7);
    CpuId(7, 1, leaf7_sub1);
  }

  flags = kCpuHasX86;
  if (leaf1[3] & 0x04000000) {
    flags |= kCpuHasSSE2;
  }
  if (leaf1[2] & 0x00000200) {
    flags |= kCpuHasSSSE3;
  }
  if (leaf1[2] & 0x00080000) {
    flags |= kCpuHasSSE41;
  }
  if (leaf1[2] & 0x00100000) {
    flags |= kCpuHasSSE42;
  }
  if (leaf7[1] & 0x00000200) {
    flags |= kCpuHasERMS;
  }

  // AVX requires OS saves YMM registers. XCR0 is read only after the AVX and
  // OSXSave bits have been confirmed.
  if ((leaf1[2] & 0x1c000000) == 0x1c000000) {
    const int xcr0 = GetXCR0();
    if ((xcr0 & 6) == 6) {  // OS saves YMM registers.
      flags |= kCpuHasAVX;
      if (leaf7[1] & 0x00000020) {
        flags |= kCpuHasAVX2;
      }
      if (leaf1[2] & 0x00001000) {
        flags |= kCpuHasFMA3;
      }
      if (leaf1[2] & 0x20000000) {
        flags |= kCpuHasF16C;
      }
      if (leaf7_sub1[0] & 0x00000010) {
        flags |= kCpuHasAVXVNNI;
      }
      if (leaf7_sub1[3] & 0x00000010) {
        flags |= kCpuHasAVXVNNIINT8;
      }

      // AVX512 features additionally need XCR0 bits 0xe0 set.
      if ((xcr0 & 0xe0) == 0xe0) {
        if (leaf7[1] & 0x40000000) {
          flags |= kCpuHasAVX512BW;
        }
        if (leaf7[1] & 0x80000000) {
          flags |= kCpuHasAVX512VL;
        }
        if (leaf7[2] & 0x00000002) {
          flags |= kCpuHasAVX512VBMI;
        }
        if (leaf7[2] & 0x00000040) {
          flags |= kCpuHasAVX512VBMI2;
        }
        if (leaf7[2] & 0x00000800) {
          flags |= kCpuHasAVX512VNNI;
        }
        if (leaf7[2] & 0x00001000) {
          flags |= kCpuHasAVX512VBITALG;
        }
        if (leaf7[2] & 0x00004000) {
          flags |= kCpuHasAVX512VPOPCNTDQ;
        }
      }
    }
  }
#endif
#if defined(__mips__) && defined(__linux__)
  flags = MipsCpuCaps("/proc/cpuinfo");
  flags |= kCpuHasMIPS;
#endif
#if defined(__loongarch__) && defined(__linux__)
  flags = LoongarchCpuCaps();
  flags |= kCpuHasLOONGARCH;
#endif
#if defined(__arm__) || defined(__aarch64__)
#if defined(__aarch64__)
  // For aarch64(arm64), /proc/cpuinfo's feature list is not complete, e.g.
  // there is no neon flag in it, so Neon is hard coded here.
  flags = kCpuHasNEON;
#else
  // 32-bit Arm: parse the text file for Neon. ArmCpuCaps() itself reports
  // Neon when the file cannot be opened, which covers targets without
  // /proc/cpuinfo.
  flags = ArmCpuCaps("/proc/cpuinfo");
#endif
  flags |= kCpuHasARM;
#endif  // __arm__
#if defined(__riscv) && defined(__linux__)
  flags = RiscvCpuCaps("/proc/cpuinfo");
  flags |= kCpuHasRISCV;
#endif  // __riscv
  flags |= kCpuInitialized;
  return flags;
}
|
|
|
|
// Note that use of this function is not thread safe.
|
|
LIBYUV_API
|
|
int MaskCpuFlags(int enable_flags) {
|
|
int cpu_info = GetCpuFlags() & enable_flags;
|
|
SetCpuFlags(cpu_info);
|
|
return cpu_info;
|
|
}
|
|
|
|
// Detects the CPU features with nothing masked off (every bit enabled) and
// returns the resulting flags.
LIBYUV_API
int InitCpuFlags(void) {
  const int enable_all = -1;
  return MaskCpuFlags(enable_all);
}
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
} // namespace libyuv
|
|
#endif
|