From fa16ddbb9fbee36d67d8496fc9389ea95930b238 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 2 Jul 2024 10:39:04 -0700 Subject: [PATCH] cpuid show vector length on ARM and RISCV - additional asm volatile changes from github - rotate mips remove C function - moved to common Run on Samsung S22 [ RUN ] LibYUVBaseTest.TestCpuHas Kernel Version 5.10 Has Arm 0x2 Has Neon 0x4 Has Neon DotProd 0x10 Has Neon I8MM 0x20 Has SVE 0x40 Has SVE2 0x80 Has SME 0x0 SVE vector length: 16 bytes [ OK ] LibYUVBaseTest.TestCpuHas (0 ms) [ RUN ] LibYUVBaseTest.TestCompilerMacros __ATOMIC_RELAXED 0 __cplusplus 201703 __clang_major__ 17 __clang_minor__ 0 __GNUC__ 4 __GNUC_MINOR__ 2 __aarch64__ 1 __clang__ 1 __llvm__ 1 __pic__ 2 INT_TYPES_DEFINED __has_feature Run on RISCV qemu emulating SiFive X280: [ RUN ] LibYUVBaseTest.TestCpuHas Kernel Version 6.6 Has RISCV 0x10000000 Has RVV 0x20000000 RVV vector length: 64 bytes [ OK ] LibYUVBaseTest.TestCpuHas (4 ms) [ RUN ] LibYUVBaseTest.TestCompilerMacros __ATOMIC_RELAXED 0 __cplusplus 202002 __clang_major__ 9999 __clang_minor__ 0 __GNUC__ 4 __GNUC_MINOR__ 2 __riscv 1 __riscv_vector 1 __riscv_v_intrinsic 12000 __riscv_zve64x 1000000 __clang__ 1 __llvm__ 1 __pic__ 2 INT_TYPES_DEFINED __has_feature Bug: b/42280943 Change-Id: I53cf0450be4965a28942e113e4c77295ace70999 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5672088 Reviewed-by: David Gao --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/rotate_msa.cc | 10 -- source/row_gcc.cc | 28 ++--- source/scale_gcc.cc | 173 ++++++++++++++++++---------- unit_test/cpu_test.cc | 236 ++++++++++++++++++++++++++++----------- util/cpuid.c | 135 ++++++++++++---------- 7 files changed, 373 insertions(+), 213 deletions(-) diff --git a/README.chromium b/README.chromium index 6412e18ae..1a852d7f8 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1889 +Version: 1890 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 001600c90..761bce145 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1889 +#define LIBYUV_VERSION 1890 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/rotate_msa.cc b/source/rotate_msa.cc index 99bdca65b..d4e62b12e 100644 --- a/source/rotate_msa.cc +++ b/source/rotate_msa.cc @@ -51,16 +51,6 @@ extern "C" { out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ } -void TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - TransposeWx8_C(src, src_stride, dst, dst_stride, width); - TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, - width); -} - void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index f8f41860a..782382d0c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1090,7 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1137,7 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile (LABELALIGN + asm volatile ( 
"1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1186,7 +1186,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1239,7 +1239,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5428,7 +5428,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5502,7 +5502,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5598,7 +5598,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5776,7 +5776,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -6350,7 +6350,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6451,7 +6451,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6642,7 +6642,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6741,7 +6741,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6867,7 +6867,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6970,7 +6970,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 9dfe64a93..27cdc17aa 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm( + asm volatile ( // 16 pixel loop. 
LABELALIGN "1: \n" @@ -123,7 +123,8 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -153,7 +154,8 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -193,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -219,7 +221,8 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -251,7 +254,8 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -293,7 +297,8 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" "pslld $0x10,%%xmm5 \n" @@ -323,7 +328,8 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -377,7 +383,8 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n" @@ -409,7 +416,8 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -464,7 +472,8 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("movdqa %0,%%xmm3 \n" + asm volatile ( + "movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" "movdqa %2,%%xmm5 \n" : @@ -472,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -499,7 +508,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" // kShuf01 + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -507,7 +517,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm("movdqa %0,%%xmm5 \n" // kMadd01 + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa 
%1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -515,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -561,7 +572,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" // kShuf01 + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -569,7 +581,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm("movdqa %0,%%xmm5 \n" // kMadd01 + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -578,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -628,7 +641,8 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -657,7 +671,8 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" + asm volatile ( + "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "movdqa %3,%%xmm5 \n" @@ -667,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -699,7 +714,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" + asm volatile ( + "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -708,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -766,7 +782,8 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("pxor %%xmm0,%%xmm0 \n" // 0 + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $1,%%xmm6 \n" // all 2 @@ -821,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -934,7 +951,8 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("movdqa %3,%%xmm5 \n" + asm volatile ( + "movdqa %3,%%xmm5 \n" "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -985,7 +1003,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pcmpeqw %%xmm7,%%xmm7 \n" + asm volatile ( + "pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 "movdqa %5,%%xmm6 \n" @@ -1082,7 +1101,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, void 
ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -1134,7 +1154,8 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pxor %%xmm7,%%xmm7 \n" + asm volatile ( + "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -1241,7 +1262,8 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqw %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -1281,7 +1303,8 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pcmpeqw %%xmm6,%%xmm6 \n" + asm volatile ( + "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -1365,7 +1388,8 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -1408,7 +1432,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -1489,7 +1514,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1540,7 +1566,8 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 @@ -1601,7 +1628,8 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1650,7 +1678,8 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1732,7 +1761,8 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" // 16 pixel loop. 
LABELALIGN @@ -1763,7 +1793,8 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" @@ -1804,7 +1835,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm("movd %6,%%xmm2 \n" + asm volatile ( + "movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" "movd %k2,%%xmm5 \n" @@ -1900,7 +1932,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1925,7 +1957,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1947,7 +1979,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1971,7 +2003,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2005,7 +2037,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm("lea 0x00(,%1,4),%1 \n" + asm volatile ( + "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" LABELALIGN @@ -2041,7 +2074,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm("lea 0x00(,%1,4),%1 \n" + asm volatile ( + "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%0,%5,1),%5 \n" @@ -2083,7 +2117,8 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm("movd %5,%%xmm2 \n" + asm volatile ( + "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x11,%%xmm3,%%xmm0 \n" @@ -2153,7 +2188,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2191,14 +2226,16 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm("movdqa %0,%%xmm4 \n" + asm volatile ( + "movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); - asm("movd %5,%%xmm2 \n" + asm volatile ( + "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" @@ -2260,7 +2297,8 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm("cdq \n" + asm volatile ( + "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "idiv %1 \n" @@ -2273,7 +2311,8 @@ int FixedDiv_X86(int num, int div) { // Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
int FixedDiv1_X86(int num, int div) { - asm("cdq \n" + asm volatile ( + "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "sub $0x10001,%%eax \n" @@ -2304,7 +2343,8 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5, %%xmm5 \n" // zero @@ -2343,7 +2383,8 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero @@ -2386,7 +2427,8 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqw %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -2426,7 +2468,8 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pcmpeqw %%xmm6,%%xmm6 \n" + asm volatile ( + "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -2509,7 +2552,8 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -2551,7 +2595,8 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -2630,7 +2675,8 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -2681,7 +2727,8 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pxor %%xmm7,%%xmm7 \n" + asm volatile ( + "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -2771,7 +2818,8 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2819,7 +2867,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 325249364..b551ddd52 100644 --- a/unit_test/cpu_test.cc +++ 
b/unit_test/cpu_test.cc
@@ -11,6 +11,11 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __linux__
+#include <ctype.h>
+#include <sys/utsname.h>
+#endif
+
 #include "../unit_test/unit_test.h"
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
@@ -18,80 +23,179 @@
 namespace libyuv {
 
+#ifdef __linux__
+static void KernelVersion(int *version) {
+  struct utsname buffer;
+  int i = 0;
+
+  version[0] = version[1] = 0;
+  if (uname(&buffer) == 0) {
+    char *v = buffer.release;
+    for (i = 0; *v && i < 2; ++v) {
+      if (isdigit(*v)) {
+        version[i++] = (int) strtol(v, &v, 10);
+      }
+    }
+  }
+}
+#endif
+
 TEST_F(LibYUVBaseTest, TestCpuHas) {
-  int cpu_flags = TestCpuFlag(-1);
-  printf("Cpu Flags 0x%x\n", cpu_flags);
+#if defined(__linux__)
+  {
+    int kernelversion[2];
+    KernelVersion(kernelversion);
+    printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
+  }
+#endif // defined(__linux__)
+
 #if defined(__arm__) || defined(__aarch64__)
   int has_arm = TestCpuFlag(kCpuHasARM);
-  printf("Has ARM 0x%x\n", has_arm);
-  int has_neon = TestCpuFlag(kCpuHasNEON);
-  printf("Has NEON 0x%x\n", has_neon);
-#endif
-#if defined(__riscv) && defined(__linux__)
+  if (has_arm) {
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
+    int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
+    int has_sve = TestCpuFlag(kCpuHasSVE);
+    int has_sve2 = TestCpuFlag(kCpuHasSVE2);
+    int has_sme = TestCpuFlag(kCpuHasSME);
+    printf("Has Arm 0x%x\n", has_arm);
+    printf("Has Neon 0x%x\n", has_neon);
+    printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
+    printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
+    printf("Has SVE 0x%x\n", has_sve);
+    printf("Has SVE2 0x%x\n", has_sve2);
+    printf("Has SME 0x%x\n", has_sme);
+
+#if defined(__aarch64__)
+    // Read and print the SVE and SME vector lengths.
+    if (has_sve) {
+      int sve_vl;
+      // rdvl x0, #1
+      asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
+      printf("SVE vector length: %d bytes\n", sve_vl);
+    }
+    if (has_sme) {
+      int sme_vl;
+      // rdsvl x0, #1
+      asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
+      printf("SME vector length: %d bytes\n", sme_vl);
+    }
+#endif // defined(__aarch64__)
+  }
+#endif // if defined(__arm__) || defined(__aarch64__)
+
+#if defined(__riscv)
   int has_riscv = TestCpuFlag(kCpuHasRISCV);
-  printf("Has RISCV 0x%x\n", has_riscv);
-  int has_rvv = TestCpuFlag(kCpuHasRVV);
-  printf("Has RVV 0x%x\n", has_rvv);
-  int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH);
-  printf("Has RVVZVFH 0x%x\n", has_rvvzvfh);
-#endif
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
-    defined(_M_X64)
-  int has_x86 = TestCpuFlag(kCpuHasX86);
-  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
-  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-  int has_sse41 = TestCpuFlag(kCpuHasSSE41);
-  int has_sse42 = TestCpuFlag(kCpuHasSSE42);
-  int has_avx = TestCpuFlag(kCpuHasAVX);
-  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
-  int has_erms = TestCpuFlag(kCpuHasERMS);
-  int has_fma3 = TestCpuFlag(kCpuHasFMA3);
-  int has_f16c = TestCpuFlag(kCpuHasF16C);
-  int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
-  int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
-  int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
-  int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
-  int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
-  int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
-  int has_avx10 = TestCpuFlag(kCpuHasAVX10);
-  int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
-  int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
-  int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
-  printf("Has X86 0x%x\n", has_x86);
-  printf("Has SSE2 0x%x\n", has_sse2);
-  printf("Has SSSE3 0x%x\n", has_ssse3);
-  printf("Has SSE41 0x%x\n", has_sse41);
-  printf("Has SSE42 0x%x\n", has_sse42);
-  printf("Has AVX 0x%x\n", has_avx);
-  printf("Has AVX2 0x%x\n", has_avx2);
-  printf("Has ERMS 0x%x\n", has_erms);
-  printf("Has FMA3 0x%x\n", has_fma3);
-  printf("Has F16C 0x%x\n", has_f16c);
-  printf("Has AVX512BW 0x%x\n", has_avx512bw);
-  printf("Has AVX512VL 0x%x\n", has_avx512vl);
-  printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
-  printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
-  printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
-  printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
-  printf("Has AVX10 0x%x\n", has_avx10);
-  printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
-  printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
-  printf("Has AMXINT8 0x%x\n", has_amxint8);
-#endif
+  if (has_riscv) {
+    int has_rvv = TestCpuFlag(kCpuHasRVV);
+    printf("Has RISCV 0x%x\n", has_riscv);
+    printf("Has RVV 0x%x\n", has_rvv);
+
+    // Read and print the RVV vector length.
+    if (has_rvv) {
+      register uint32_t vlenb __asm__ ("t0");
+      __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
+      printf("RVV vector length: %d bytes\n", vlenb);
+    }
+  }
+#endif // defined(__riscv)
+
 #if defined(__mips__)
   int has_mips = TestCpuFlag(kCpuHasMIPS);
-  printf("Has MIPS 0x%x\n", has_mips);
-  int has_msa = TestCpuFlag(kCpuHasMSA);
-  printf("Has MSA 0x%x\n", has_msa);
-#endif
+  if (has_mips) {
+    int has_msa = TestCpuFlag(kCpuHasMSA);
+    printf("Has MIPS 0x%x\n", has_mips);
+    printf("Has MSA 0x%x\n", has_msa);
+  }
+#endif // defined(__mips__)
+
 #if defined(__loongarch__)
   int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
-  printf("Has LOONGARCH 0x%x\n", has_loongarch);
-  int has_lsx = TestCpuFlag(kCpuHasLSX);
-  printf("Has LSX 0x%x\n", has_lsx);
-  int has_lasx = TestCpuFlag(kCpuHasLASX);
-  printf("Has LASX 0x%x\n", has_lasx);
-#endif
+  if (has_loongarch) {
+    int has_lsx = TestCpuFlag(kCpuHasLSX);
+    int has_lasx = TestCpuFlag(kCpuHasLASX);
+    printf("Has LOONGARCH 0x%x\n", has_loongarch);
+    printf("Has LSX 0x%x\n", has_lsx);
+    printf("Has LASX 0x%x\n", has_lasx);
+  }
+#endif // defined(__loongarch__)
+
+#if defined(__i386__) || defined(__x86_64__) || \
+    defined(_M_IX86) || defined(_M_X64)
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  if (has_x86) {
+    int family, model, cpu_info[4];
+    // Vendor ID:
+    // AuthenticAMD AMD processor
+    // CentaurHauls Centaur processor
+    // CyrixInstead Cyrix processor
+    // GenuineIntel Intel processor
+    // GenuineTMx86 Transmeta processor
+    // Geode by NSC National Semiconductor processor
+    // NexGenDriven NexGen processor
+    // RiseRiseRise Rise Technology processor
+    // SiS SiS SiS SiS processor
+    // UMC UMC UMC UMC processor
+    CpuId(0, 0, &cpu_info[0]);
+    cpu_info[0] = cpu_info[1]; // Reorder output
+    cpu_info[1] = cpu_info[3];
+    cpu_info[3] = 0;
+    printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
+
+    // CPU Family and Model
+    // 3:0 - Stepping
+    // 7:4 - Model
+    // 11:8 - Family
+    // 13:12 - Processor Type
+    // 19:16 - Extended Model
+    // 27:20 - Extended Family
+    CpuId(1, 0, &cpu_info[0]);
+    family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+    model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+           model, model);
+
+    int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+    int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+    int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+    int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+    int has_avx = TestCpuFlag(kCpuHasAVX);
+    int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+    int has_erms = TestCpuFlag(kCpuHasERMS);
+    int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+    int has_f16c = TestCpuFlag(kCpuHasF16C);
+    int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
+    int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+    int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
+    int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
+    int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
+    int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
+    int has_avx10 = TestCpuFlag(kCpuHasAVX10);
+    int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
+    int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
+    int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
+    printf("Has X86 0x%x\n", has_x86);
+    printf("Has SSE2 0x%x\n", has_sse2);
+    printf("Has SSSE3 0x%x\n", has_ssse3);
+    printf("Has SSE4.1 0x%x\n", has_sse41);
+    printf("Has SSE4.2 0x%x\n", has_sse42);
+    printf("Has AVX 0x%x\n", has_avx);
+    printf("Has AVX2 0x%x\n", has_avx2);
+    printf("Has ERMS 0x%x\n", has_erms);
+    printf("Has FMA3 0x%x\n", has_fma3);
+    printf("Has F16C 0x%x\n", has_f16c);
+    printf("Has AVX512BW 0x%x\n", has_avx512bw);
+    printf("Has AVX512VL 0x%x\n", has_avx512vl);
+    printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
+    printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
+    printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
+    printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
+    printf("Has AVX10 0x%x\n", has_avx10);
+    printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
+    printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
+    printf("Has AMXINT8 0x%x\n", has_amxint8);
+  }
+#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
 }
 
 TEST_F(LibYUVBaseTest, TestCompilerMacros) {
diff --git a/util/cpuid.c b/util/cpuid.c
index 0f1f806a8..725bc928b 100644
--- a/util/cpuid.c
+++ b/util/cpuid.c
@@ -41,24 +41,91 @@ static void KernelVersion(int *version) {
 #endif
 
 int main(int argc, const char* argv[]) {
-  int cpu_flags = TestCpuFlag(-1);
-  int has_arm = TestCpuFlag(kCpuHasARM);
-  int has_riscv = TestCpuFlag(kCpuHasRISCV);
-  int has_x86 = TestCpuFlag(kCpuHasX86);
-  int has_mips = TestCpuFlag(kCpuHasMIPS);
-  int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
   (void)argc;
   (void)argv;
 
-#ifdef __linux__
+#if defined(__linux__)
   {
     int kernelversion[2];
     KernelVersion(kernelversion);
     printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
   }
-#endif
+#endif // defined(__linux__)
+
+#if defined(__arm__) || defined(__aarch64__)
+  int has_arm = TestCpuFlag(kCpuHasARM);
+  if (has_arm) {
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
+    int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
+    int has_sve = TestCpuFlag(kCpuHasSVE);
+    int has_sve2 = TestCpuFlag(kCpuHasSVE2);
+    int has_sme = TestCpuFlag(kCpuHasSME);
+    printf("Has Arm 0x%x\n", has_arm);
+    printf("Has Neon 0x%x\n", has_neon);
+    printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
+    printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
+    printf("Has SVE 0x%x\n", has_sve);
+    printf("Has SVE2 0x%x\n", has_sve2);
+    printf("Has SME 0x%x\n", has_sme);
+
+#if defined(__aarch64__)
+    // Read and print the SVE and SME vector lengths.
+    if (has_sve) {
+      int sve_vl;
+      // rdvl x0, #1
+      asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
+      printf("SVE vector length: %d bytes\n", sve_vl);
+    }
+    if (has_sme) {
+      int sme_vl;
+      // rdsvl x0, #1
+      asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
+      printf("SME vector length: %d bytes\n", sme_vl);
+    }
+#endif // defined(__aarch64__)
+  }
+#endif // if defined(__arm__) || defined(__aarch64__)
+
+#if defined(__riscv)
+  int has_riscv = TestCpuFlag(kCpuHasRISCV);
+  if (has_riscv) {
+    int has_rvv = TestCpuFlag(kCpuHasRVV);
+    printf("Has RISCV 0x%x\n", has_riscv);
+    printf("Has RVV 0x%x\n", has_rvv);
+
+    // Read and print the RVV vector length.
+    if (has_rvv) {
+      register uint32_t vlenb __asm__ ("t0");
+      __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
+      printf("RVV vector length: %d bytes\n", vlenb);
+    }
+  }
+#endif // defined(__riscv)
+
+#if defined(__mips__)
+  int has_mips = TestCpuFlag(kCpuHasMIPS);
+  if (has_mips) {
+    int has_msa = TestCpuFlag(kCpuHasMSA);
+    printf("Has MIPS 0x%x\n", has_mips);
+    printf("Has MSA 0x%x\n", has_msa);
+  }
+#endif // defined(__mips__)
+
+#if defined(__loongarch__)
+  int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
+  if (has_loongarch) {
+    int has_lsx = TestCpuFlag(kCpuHasLSX);
+    int has_lasx = TestCpuFlag(kCpuHasLASX);
+    printf("Has LOONGARCH 0x%x\n", has_loongarch);
+    printf("Has LSX 0x%x\n", has_lsx);
+    printf("Has LASX 0x%x\n", has_lasx);
+  }
+#endif // defined(__loongarch__)
+
 #if defined(__i386__) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_X64)
+  int has_x86 = TestCpuFlag(kCpuHasX86);
   if (has_x86) {
     int family, model, cpu_info[4];
     // Vendor ID:
@@ -90,58 +157,7 @@ int main(int argc, const char* argv[]) {
     model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
     printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
            model, model);
 
-  }
-#endif
-  printf("Cpu Flags 0x%x\n", cpu_flags);
-  if (has_arm) {
-    int has_neon = TestCpuFlag(kCpuHasNEON);
-    int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
-    int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
-    int has_sve = TestCpuFlag(kCpuHasSVE);
-    int has_sve2 = TestCpuFlag(kCpuHasSVE2);
-    int has_sme = TestCpuFlag(kCpuHasSME);
-    printf("Has Arm 0x%x\n", has_arm);
-    printf("Has Neon 0x%x\n", has_neon);
-    printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
-    printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
-    printf("Has SVE 0x%x\n", has_sve);
-    printf("Has SVE2 0x%x\n", has_sve2);
-    printf("Has SME 0x%x\n", has_sme);
-#if __aarch64__
-    // Read and print the SVE and SME vector lengths.
-    if (has_sve) {
-      int sve_vl;
-      // rdvl x0, #1
-      asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
-      printf("SVE vector length: %d bytes\n", sve_vl);
-    }
-    if (has_sme) {
-      int sme_vl;
-      // rdsvl x0, #1
-      asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
-      printf("SME vector length: %d bytes\n", sme_vl);
-    }
-#endif
-  }
-  if (has_riscv) {
-    int has_rvv = TestCpuFlag(kCpuHasRVV);
-    printf("Has RISCV 0x%x\n", has_riscv);
-    printf("Has RVV 0x%x\n", has_rvv);
-  }
-  if (has_mips) {
-    int has_msa = TestCpuFlag(kCpuHasMSA);
-    printf("Has MIPS 0x%x\n", has_mips);
-    printf("Has MSA 0x%x\n", has_msa);
-  }
-  if (has_loongarch) {
-    int has_lsx = TestCpuFlag(kCpuHasLSX);
-    int has_lasx = TestCpuFlag(kCpuHasLASX);
-    printf("Has LOONGARCH 0x%x\n", has_loongarch);
-    printf("Has LSX 0x%x\n", has_lsx);
-    printf("Has LASX 0x%x\n", has_lasx);
-  }
-  if (has_x86) {
     int has_sse2 = TestCpuFlag(kCpuHasSSE2);
     int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
     int has_sse41 = TestCpuFlag(kCpuHasSSE41);
@@ -182,6 +198,7 @@ int main(int argc, const char* argv[]) {
     printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
     printf("Has AMXINT8 0x%x\n", has_amxint8);
   }
+#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
 
   return 0;
 }
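
The SVE/SME vector-length reads above can be exercised on their own. Below is a minimal standalone C sketch built from the same inline assembly this patch adds to cpu_test.cc and util/cpuid.c; it assumes an AArch64 Linux toolchain and that SVE and SME are already known to be present (the TestCpuFlag()-based detection that guards these reads in the patch is omitted here):

// Minimal sketch: query SVE and SME vector lengths on AArch64.
// Assumes the CPU implements SVE and SME; feature detection is omitted,
// so running this on a core without them will fault.
#include <stdio.h>

int main(void) {
  int sve_vl, sme_vl;
  // rdvl x0, #1 - returns the SVE vector length in bytes.
  // Emitted with .inst so no SVE-aware assembler is required.
  asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
  // rdsvl x0, #1 - returns the streaming (SME) vector length in bytes.
  asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
  printf("SVE vector length: %d bytes\n", sve_vl);
  printf("SME vector length: %d bytes\n", sme_vl);
  return 0;
}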
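
The RVV length read is likewise a single CSR access. Here is a standalone sketch using the same raw encoding as the patch (the .word form of csrr t0, vlenb, so the assembler does not need to accept the V extension); it assumes the hart actually implements RVV, which the patch checks first via TestCpuFlag(kCpuHasRVV):

// Minimal sketch: read the RVV VLENB CSR (0xC22) on a RISC-V hart with the
// V extension. No runtime detection is done here.
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Pin the output to t0 because the hand-encoded csrr below writes t0.
  register uint32_t vlenb __asm__("t0");
  __asm__(".word 0xC22022F3" /* csrr t0, vlenb */ : "=r"(vlenb));
  printf("RVV vector length: %d bytes\n", (int)vlenb);
  return 0;
}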
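
The family/model decode that cpu_test.cc now shares with util/cpuid.c follows the CPUID leaf 1 layout documented in the comments above. For reference, a standalone sketch of the same decode; it uses the GCC/Clang <cpuid.h> helper __get_cpuid() instead of libyuv's CpuId() wrapper, which is an assumption about the build environment rather than part of the patch:

// Minimal sketch: print the CPU vendor string and the family/model values
// derived from CPUID leaf 1 (stepping 3:0, model 7:4, family 11:8,
// extended model 19:16, extended family 27:20), mirroring the patch.
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  char vendor[13];

  // Leaf 0: the 12-byte vendor string is returned in EBX, EDX, ECX.
  if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
    return 1;  // CPUID not available.
  }
  memcpy(vendor + 0, &ebx, 4);
  memcpy(vendor + 4, &edx, 4);
  memcpy(vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  printf("Cpu Vendor: %s\n", vendor);

  // Leaf 1: same bit extraction as the patch.
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    return 1;
  }
  int family = ((eax >> 8) & 0x0f) | ((eax >> 16) & 0xff0);
  int model = ((eax >> 4) & 0x0f) | ((eax >> 12) & 0xf0);
  printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, model);
  return 0;
}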