From fa16ddbb9fbee36d67d8496fc9389ea95930b238 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 2 Jul 2024 10:39:04 -0700 Subject: [PATCH] cpuid show vector length on ARM and RISCV - additional asm volatile changes from github - rotate mips remove C function - moved to common Run on Samsung S22 [ RUN ] LibYUVBaseTest.TestCpuHas Kernel Version 5.10 Has Arm 0x2 Has Neon 0x4 Has Neon DotProd 0x10 Has Neon I8MM 0x20 Has SVE 0x40 Has SVE2 0x80 Has SME 0x0 SVE vector length: 16 bytes [ OK ] LibYUVBaseTest.TestCpuHas (0 ms) [ RUN ] LibYUVBaseTest.TestCompilerMacros __ATOMIC_RELAXED 0 __cplusplus 201703 __clang_major__ 17 __clang_minor__ 0 __GNUC__ 4 __GNUC_MINOR__ 2 __aarch64__ 1 __clang__ 1 __llvm__ 1 __pic__ 2 INT_TYPES_DEFINED __has_feature Run on RISCV qemu emulating SiFive X280: [ RUN ] LibYUVBaseTest.TestCpuHas Kernel Version 6.6 Has RISCV 0x10000000 Has RVV 0x20000000 RVV vector length: 64 bytes [ OK ] LibYUVBaseTest.TestCpuHas (4 ms) [ RUN ] LibYUVBaseTest.TestCompilerMacros __ATOMIC_RELAXED 0 __cplusplus 202002 __clang_major__ 9999 __clang_minor__ 0 __GNUC__ 4 __GNUC_MINOR__ 2 __riscv 1 __riscv_vector 1 __riscv_v_intrinsic 12000 __riscv_zve64x 1000000 __clang__ 1 __llvm__ 1 __pic__ 2 INT_TYPES_DEFINED __has_feature Bug: b/42280943 Change-Id: I53cf0450be4965a28942e113e4c77295ace70999 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5672088 Reviewed-by: David Gao --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/rotate_msa.cc | 10 -- source/row_gcc.cc | 28 ++--- source/scale_gcc.cc | 173 ++++++++++++++++++---------- unit_test/cpu_test.cc | 236 ++++++++++++++++++++++++++++----------- util/cpuid.c | 135 ++++++++++++---------- 7 files changed, 373 insertions(+), 213 deletions(-) diff --git a/README.chromium b/README.chromium index 6412e18ae..1a852d7f8 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1889 +Version: 1890 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 001600c90..761bce145 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1889 +#define LIBYUV_VERSION 1890 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/rotate_msa.cc b/source/rotate_msa.cc index 99bdca65b..d4e62b12e 100644 --- a/source/rotate_msa.cc +++ b/source/rotate_msa.cc @@ -51,16 +51,6 @@ extern "C" { out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ } -void TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - TransposeWx8_C(src, src_stride, dst, dst_stride, width); - TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, - width); -} - void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index f8f41860a..782382d0c 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1090,7 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1137,7 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile (LABELALIGN + asm volatile ( 
"1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1186,7 +1186,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1239,7 +1239,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -5428,7 +5428,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5502,7 +5502,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5598,7 +5598,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5776,7 +5776,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -6350,7 +6350,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6451,7 +6451,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6642,7 +6642,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6741,7 +6741,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6867,7 +6867,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6970,7 +6970,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 9dfe64a93..27cdc17aa 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm( + asm volatile ( // 16 pixel loop. 
LABELALIGN "1: \n" @@ -123,7 +123,8 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -153,7 +154,8 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -193,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -219,7 +221,8 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -251,7 +254,8 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" @@ -293,7 +297,8 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("pcmpeqb %%xmm5,%%xmm5 \n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" "pslld $0x10,%%xmm5 \n" @@ -323,7 +328,8 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm("pcmpeqb %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -377,7 +383,8 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n" @@ -409,7 +416,8 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -464,7 +472,8 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("movdqa %0,%%xmm3 \n" + asm volatile ( + "movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" "movdqa %2,%%xmm5 \n" : @@ -472,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -499,7 +508,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" // kShuf01 + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -507,7 +517,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm("movdqa %0,%%xmm5 \n" // kMadd01 + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa 
%1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -515,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -561,7 +572,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" // kShuf01 + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : @@ -569,7 +581,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm("movdqa %0,%%xmm5 \n" // kMadd01 + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : @@ -578,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -628,7 +641,8 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm("movdqa %3,%%xmm4 \n" + asm volatile ( + "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN @@ -657,7 +671,8 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" + asm volatile ( + "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "movdqa %3,%%xmm5 \n" @@ -667,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -699,7 +714,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("movdqa %0,%%xmm2 \n" + asm volatile ( + "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -708,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -766,7 +782,8 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("pxor %%xmm0,%%xmm0 \n" // 0 + asm volatile ( + "pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $1,%%xmm6 \n" // all 2 @@ -821,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -934,7 +951,8 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("movdqa %3,%%xmm5 \n" + asm volatile ( + "movdqa %3,%%xmm5 \n" "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -985,7 +1003,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pcmpeqw %%xmm7,%%xmm7 \n" + asm volatile ( + "pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 "movdqa %5,%%xmm6 \n" @@ -1082,7 +1101,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, void 
ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -1134,7 +1154,8 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pxor %%xmm7,%%xmm7 \n" + asm volatile ( + "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -1241,7 +1262,8 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqw %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -1281,7 +1303,8 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pcmpeqw %%xmm6,%%xmm6 \n" + asm volatile ( + "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -1365,7 +1388,8 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -1408,7 +1432,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -1489,7 +1514,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("vbroadcastf128 %3,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1540,7 +1566,8 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vbroadcastf128 %5,%%ymm5 \n" + asm volatile ( + "vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 @@ -1601,7 +1628,8 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1650,7 +1678,8 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1732,7 +1761,8 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" // 16 pixel loop. 
LABELALIGN @@ -1763,7 +1793,8 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile ( + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" @@ -1804,7 +1835,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm("movd %6,%%xmm2 \n" + asm volatile ( + "movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" "movd %k2,%%xmm5 \n" @@ -1900,7 +1932,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1925,7 +1957,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1947,7 +1979,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1971,7 +2003,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2005,7 +2037,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm("lea 0x00(,%1,4),%1 \n" + asm volatile ( + "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" LABELALIGN @@ -2041,7 +2074,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm("lea 0x00(,%1,4),%1 \n" + asm volatile ( + "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%0,%5,1),%5 \n" @@ -2083,7 +2117,8 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm("movd %5,%%xmm2 \n" + asm volatile ( + "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x11,%%xmm3,%%xmm0 \n" @@ -2153,7 +2188,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile (LABELALIGN + asm volatile ( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2191,14 +2226,16 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm("movdqa %0,%%xmm4 \n" + asm volatile ( + "movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); - asm("movd %5,%%xmm2 \n" + asm volatile ( + "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" @@ -2260,7 +2297,8 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm("cdq \n" + asm volatile ( + "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "idiv %1 \n" @@ -2273,7 +2311,8 @@ int FixedDiv_X86(int num, int div) { // Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
int FixedDiv1_X86(int num, int div) { - asm("cdq \n" + asm volatile ( + "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "sub $0x10001,%%eax \n" @@ -2304,7 +2343,8 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 + asm volatile ( + "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5, %%xmm5 \n" // zero @@ -2343,7 +2383,8 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 + asm volatile ( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero @@ -2386,7 +2427,8 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("pcmpeqw %%xmm4,%%xmm4 \n" + asm volatile ( + "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" @@ -2426,7 +2468,8 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pcmpeqw %%xmm6,%%xmm6 \n" + asm volatile ( + "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" @@ -2509,7 +2552,8 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" @@ -2551,7 +2595,8 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" @@ -2630,7 +2675,8 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("pxor %%xmm5,%%xmm5 \n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 @@ -2681,7 +2727,8 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("pxor %%xmm7,%%xmm7 \n" + asm volatile ( + "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 @@ -2771,7 +2818,8 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" + asm volatile ( + "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2819,7 +2867,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" + asm volatile ( + "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 325249364..b551ddd52 100644 --- a/unit_test/cpu_test.cc +++ 
b/unit_test/cpu_test.cc
@@ -11,6 +11,11 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __linux__
+#include <ctype.h>
+#include <sys/utsname.h>
+#endif
+
 #include "../unit_test/unit_test.h"
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
@@ -18,80 +23,179 @@
 namespace libyuv {
 
+#ifdef __linux__
+static void KernelVersion(int *version) {
+  struct utsname buffer;
+  int i = 0;
+
+  version[0] = version[1] = 0;
+  if (uname(&buffer) == 0) {
+    char *v = buffer.release;
+    for (i = 0; *v && i < 2; ++v) {
+      if (isdigit(*v)) {
+        version[i++] = (int) strtol(v, &v, 10);
+      }
+    }
+  }
+}
+#endif
+
 TEST_F(LibYUVBaseTest, TestCpuHas) {
-  int cpu_flags = TestCpuFlag(-1);
-  printf("Cpu Flags 0x%x\n", cpu_flags);
+#if defined(__linux__)
+  {
+    int kernelversion[2];
+    KernelVersion(kernelversion);
+    printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
+  }
+#endif // defined(__linux__)
+
 #if defined(__arm__) || defined(__aarch64__)
   int has_arm = TestCpuFlag(kCpuHasARM);
-  printf("Has ARM 0x%x\n", has_arm);
-  int has_neon = TestCpuFlag(kCpuHasNEON);
-  printf("Has NEON 0x%x\n", has_neon);
-#endif
-#if defined(__riscv) && defined(__linux__)
+  if (has_arm) {
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
+    int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
+    int has_sve = TestCpuFlag(kCpuHasSVE);
+    int has_sve2 = TestCpuFlag(kCpuHasSVE2);
+    int has_sme = TestCpuFlag(kCpuHasSME);
+    printf("Has Arm 0x%x\n", has_arm);
+    printf("Has Neon 0x%x\n", has_neon);
+    printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
+    printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
+    printf("Has SVE 0x%x\n", has_sve);
+    printf("Has SVE2 0x%x\n", has_sve2);
+    printf("Has SME 0x%x\n", has_sme);
+
+#if defined(__aarch64__)
+    // Read and print the SVE and SME vector lengths.
+    if (has_sve) {
+      int sve_vl;
+      // rdvl x0, #1
+      asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
+      printf("SVE vector length: %d bytes\n", sve_vl);
+    }
+    if (has_sme) {
+      int sme_vl;
+      // rdsvl x0, #1
+      asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
+      printf("SME vector length: %d bytes\n", sme_vl);
+    }
+#endif // defined(__aarch64__)
+  }
+#endif // if defined(__arm__) || defined(__aarch64__)
+
+#if defined(__riscv)
   int has_riscv = TestCpuFlag(kCpuHasRISCV);
-  printf("Has RISCV 0x%x\n", has_riscv);
-  int has_rvv = TestCpuFlag(kCpuHasRVV);
-  printf("Has RVV 0x%x\n", has_rvv);
-  int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH);
-  printf("Has RVVZVFH 0x%x\n", has_rvvzvfh);
-#endif
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
-    defined(_M_X64)
-  int has_x86 = TestCpuFlag(kCpuHasX86);
-  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
-  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-  int has_sse41 = TestCpuFlag(kCpuHasSSE41);
-  int has_sse42 = TestCpuFlag(kCpuHasSSE42);
-  int has_avx = TestCpuFlag(kCpuHasAVX);
-  int has_avx2 = TestCpuFlag(kCpuHasAVX2);
-  int has_erms = TestCpuFlag(kCpuHasERMS);
-  int has_fma3 = TestCpuFlag(kCpuHasFMA3);
-  int has_f16c = TestCpuFlag(kCpuHasF16C);
-  int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
-  int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
-  int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
-  int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
-  int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
-  int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
-  int has_avx10 = TestCpuFlag(kCpuHasAVX10);
-  int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
-  int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
-  int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
-  printf("Has X86 0x%x\n", has_x86);
-  printf("Has SSE2 0x%x\n", has_sse2);
-  printf("Has SSSE3 0x%x\n", has_ssse3);
-  printf("Has SSE41 0x%x\n", has_sse41);
-  printf("Has SSE42 0x%x\n", has_sse42);
-  printf("Has AVX 0x%x\n", has_avx);
-  printf("Has AVX2 0x%x\n", has_avx2);
-  printf("Has ERMS 0x%x\n", has_erms);
-  printf("Has FMA3 0x%x\n", has_fma3);
-  printf("Has F16C 0x%x\n", has_f16c);
-  printf("Has AVX512BW 0x%x\n", has_avx512bw);
-  printf("Has AVX512VL 0x%x\n", has_avx512vl);
-  printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
-  printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
-  printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
-  printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
-  printf("Has AVX10 0x%x\n", has_avx10);
-  printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
-  printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
-  printf("Has AMXINT8 0x%x\n", has_amxint8);
-#endif
+  if (has_riscv) {
+    int has_rvv = TestCpuFlag(kCpuHasRVV);
+    printf("Has RISCV 0x%x\n", has_riscv);
+    printf("Has RVV 0x%x\n", has_rvv);
+
+    // Read and print the RVV vector length.
+    if (has_rvv) {
+      register uint32_t vlenb __asm__ ("t0");
+      __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
+      printf("RVV vector length: %d bytes\n", vlenb);
+    }
+  }
+#endif // defined(__riscv)
+
 #if defined(__mips__)
   int has_mips = TestCpuFlag(kCpuHasMIPS);
-  printf("Has MIPS 0x%x\n", has_mips);
-  int has_msa = TestCpuFlag(kCpuHasMSA);
-  printf("Has MSA 0x%x\n", has_msa);
-#endif
+  if (has_mips) {
+    int has_msa = TestCpuFlag(kCpuHasMSA);
+    printf("Has MIPS 0x%x\n", has_mips);
+    printf("Has MSA 0x%x\n", has_msa);
+  }
+#endif // defined(__mips__)
+
 #if defined(__loongarch__)
   int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
-  printf("Has LOONGARCH 0x%x\n", has_loongarch);
-  int has_lsx = TestCpuFlag(kCpuHasLSX);
-  printf("Has LSX 0x%x\n", has_lsx);
-  int has_lasx = TestCpuFlag(kCpuHasLASX);
-  printf("Has LASX 0x%x\n", has_lasx);
-#endif
+  if (has_loongarch) {
+    int has_lsx = TestCpuFlag(kCpuHasLSX);
+    int has_lasx = TestCpuFlag(kCpuHasLASX);
+    printf("Has LOONGARCH 0x%x\n", has_loongarch);
+    printf("Has LSX 0x%x\n", has_lsx);
+    printf("Has LASX 0x%x\n", has_lasx);
+  }
+#endif // defined(__loongarch__)
+
+#if defined(__i386__) || defined(__x86_64__) || \
+    defined(_M_IX86) || defined(_M_X64)
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  if (has_x86) {
+    int family, model, cpu_info[4];
+    // Vendor ID:
+    // AuthenticAMD AMD processor
+    // CentaurHauls Centaur processor
+    // CyrixInstead Cyrix processor
+    // GenuineIntel Intel processor
+    // GenuineTMx86 Transmeta processor
+    // Geode by NSC National Semiconductor processor
+    // NexGenDriven NexGen processor
+    // RiseRiseRise Rise Technology processor
+    // SiS SiS SiS SiS processor
+    // UMC UMC UMC UMC processor
+    CpuId(0, 0, &cpu_info[0]);
+    cpu_info[0] = cpu_info[1]; // Reorder output
+    cpu_info[1] = cpu_info[3];
+    cpu_info[3] = 0;
+    printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
+
+    // CPU Family and Model
+    // 3:0 - Stepping
+    // 7:4 - Model
+    // 11:8 - Family
+    // 13:12 - Processor Type
+    // 19:16 - Extended Model
+    // 27:20 - Extended Family
+    CpuId(1, 0, &cpu_info[0]);
+    family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+    model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+           model, model);
+
+    int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+    int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+    int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+    int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+    int has_avx = TestCpuFlag(kCpuHasAVX);
+    int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+    int has_erms = TestCpuFlag(kCpuHasERMS);
+    int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+    int has_f16c = TestCpuFlag(kCpuHasF16C);
+    int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
+    int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
+    int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
+    int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
+    int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
+    int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
+    int has_avx10 = TestCpuFlag(kCpuHasAVX10);
+    int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
+    int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
+    int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
+    printf("Has X86 0x%x\n", has_x86);
+    printf("Has SSE2 0x%x\n", has_sse2);
+    printf("Has SSSE3 0x%x\n", has_ssse3);
+    printf("Has SSE4.1 0x%x\n", has_sse41);
+    printf("Has SSE4.2 0x%x\n", has_sse42);
+    printf("Has AVX 0x%x\n", has_avx);
+    printf("Has AVX2 0x%x\n", has_avx2);
+    printf("Has ERMS 0x%x\n", has_erms);
+    printf("Has FMA3 0x%x\n", has_fma3);
+    printf("Has F16C 0x%x\n", has_f16c);
+    printf("Has AVX512BW 0x%x\n", has_avx512bw);
+    printf("Has AVX512VL 0x%x\n", has_avx512vl);
+    printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
+    printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
+    printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
+    printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
+    printf("Has AVX10 0x%x\n", has_avx10);
+    printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
+    printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
+    printf("Has AMXINT8 0x%x\n", has_amxint8);
+  }
+#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
 }
 
 TEST_F(LibYUVBaseTest, TestCompilerMacros) {
diff --git a/util/cpuid.c b/util/cpuid.c
index 0f1f806a8..725bc928b 100644
--- a/util/cpuid.c
+++ b/util/cpuid.c
@@ -41,24 +41,91 @@ static void KernelVersion(int *version) {
 #endif
 
 int main(int argc, const char* argv[]) {
-  int cpu_flags = TestCpuFlag(-1);
-  int has_arm = TestCpuFlag(kCpuHasARM);
-  int has_riscv = TestCpuFlag(kCpuHasRISCV);
-  int has_x86 = TestCpuFlag(kCpuHasX86);
-  int has_mips = TestCpuFlag(kCpuHasMIPS);
-  int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
   (void)argc;
   (void)argv;
 
-#ifdef __linux__
+#if defined(__linux__)
   {
     int kernelversion[2];
     KernelVersion(kernelversion);
     printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
   }
-#endif
+#endif // defined(__linux__)
+
+#if defined(__arm__) || defined(__aarch64__)
+  int has_arm = TestCpuFlag(kCpuHasARM);
+  if (has_arm) {
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
+    int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
+    int has_sve = TestCpuFlag(kCpuHasSVE);
+    int has_sve2 = TestCpuFlag(kCpuHasSVE2);
+    int has_sme = TestCpuFlag(kCpuHasSME);
+    printf("Has Arm 0x%x\n", has_arm);
+    printf("Has Neon 0x%x\n", has_neon);
+    printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
+    printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
+    printf("Has SVE 0x%x\n", has_sve);
+    printf("Has SVE2 0x%x\n", has_sve2);
+    printf("Has SME 0x%x\n", has_sme);
+
+#if defined(__aarch64__)
+    // Read and print the SVE and SME vector lengths.
+    if (has_sve) {
+      int sve_vl;
+      // rdvl x0, #1
+      asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
+      printf("SVE vector length: %d bytes\n", sve_vl);
+    }
+    if (has_sme) {
+      int sme_vl;
+      // rdsvl x0, #1
+      asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
+      printf("SME vector length: %d bytes\n", sme_vl);
+    }
+#endif // defined(__aarch64__)
+  }
+#endif // if defined(__arm__) || defined(__aarch64__)
+
+#if defined(__riscv)
+  int has_riscv = TestCpuFlag(kCpuHasRISCV);
+  if (has_riscv) {
+    int has_rvv = TestCpuFlag(kCpuHasRVV);
+    printf("Has RISCV 0x%x\n", has_riscv);
+    printf("Has RVV 0x%x\n", has_rvv);
+
+    // Read and print the RVV vector length.
+    if (has_rvv) {
+      register uint32_t vlenb __asm__ ("t0");
+      __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
+      printf("RVV vector length: %d bytes\n", vlenb);
+    }
+  }
+#endif // defined(__riscv)
+
+#if defined(__mips__)
+  int has_mips = TestCpuFlag(kCpuHasMIPS);
+  if (has_mips) {
+    int has_msa = TestCpuFlag(kCpuHasMSA);
+    printf("Has MIPS 0x%x\n", has_mips);
+    printf("Has MSA 0x%x\n", has_msa);
+  }
+#endif // defined(__mips__)
+
+#if defined(__loongarch__)
+  int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
+  if (has_loongarch) {
+    int has_lsx = TestCpuFlag(kCpuHasLSX);
+    int has_lasx = TestCpuFlag(kCpuHasLASX);
+    printf("Has LOONGARCH 0x%x\n", has_loongarch);
+    printf("Has LSX 0x%x\n", has_lsx);
+    printf("Has LASX 0x%x\n", has_lasx);
+  }
+#endif // defined(__loongarch__)
+
 #if defined(__i386__) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_X64)
+  int has_x86 = TestCpuFlag(kCpuHasX86);
   if (has_x86) {
     int family, model, cpu_info[4];
     // Vendor ID:
@@ -90,58 +157,7 @@ int main(int argc, const char* argv[]) {
     model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
     printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
            model, model);
 
-  }
-#endif
-  printf("Cpu Flags 0x%x\n", cpu_flags);
-  if (has_arm) {
-    int has_neon = TestCpuFlag(kCpuHasNEON);
-    int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
-    int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
-    int has_sve = TestCpuFlag(kCpuHasSVE);
-    int has_sve2 = TestCpuFlag(kCpuHasSVE2);
-    int has_sme = TestCpuFlag(kCpuHasSME);
-    printf("Has Arm 0x%x\n", has_arm);
-    printf("Has Neon 0x%x\n", has_neon);
-    printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
-    printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
-    printf("Has SVE 0x%x\n", has_sve);
-    printf("Has SVE2 0x%x\n", has_sve2);
-    printf("Has SME 0x%x\n", has_sme);
-#if __aarch64__
-    // Read and print the SVE and SME vector lengths.
-    if (has_sve) {
-      int sve_vl;
-      // rdvl x0, #1
-      asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
-      printf("SVE vector length: %d bytes\n", sve_vl);
-    }
-    if (has_sme) {
-      int sme_vl;
-      // rdsvl x0, #1
-      asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
-      printf("SME vector length: %d bytes\n", sme_vl);
-    }
-#endif
-  }
-  if (has_riscv) {
-    int has_rvv = TestCpuFlag(kCpuHasRVV);
-    printf("Has RISCV 0x%x\n", has_riscv);
-    printf("Has RVV 0x%x\n", has_rvv);
-  }
-  if (has_mips) {
-    int has_msa = TestCpuFlag(kCpuHasMSA);
-    printf("Has MIPS 0x%x\n", has_mips);
-    printf("Has MSA 0x%x\n", has_msa);
-  }
-  if (has_loongarch) {
-    int has_lsx = TestCpuFlag(kCpuHasLSX);
-    int has_lasx = TestCpuFlag(kCpuHasLASX);
-    printf("Has LOONGARCH 0x%x\n", has_loongarch);
-    printf("Has LSX 0x%x\n", has_lsx);
-    printf("Has LASX 0x%x\n", has_lasx);
-  }
-  if (has_x86) {
     int has_sse2 = TestCpuFlag(kCpuHasSSE2);
     int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
     int has_sse41 = TestCpuFlag(kCpuHasSSE41);
@@ -182,6 +198,7 @@ int main(int argc, const char* argv[]) {
     printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
     printf("Has AMXINT8 0x%x\n", has_amxint8);
   }
+#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
 
   return 0;
 }
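
The SVE/SME vector-length reads above can be exercised on their own. Below is a minimal standalone C sketch built from the same inline assembly this patch adds to cpu_test.cc and util/cpuid.c; it assumes an AArch64 Linux toolchain and that SVE and SME are already known to be present (the TestCpuFlag()-based detection that guards these reads in the patch is omitted here):

// Minimal sketch: query SVE and SME vector lengths on AArch64.
// Assumes the CPU implements SVE and SME; feature detection is omitted,
// so running this on a core without them will fault.
#include <stdio.h>

int main(void) {
  int sve_vl, sme_vl;
  // rdvl x0, #1 - returns the SVE vector length in bytes.
  // Emitted with .inst so no SVE-aware assembler is required.
  asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
  // rdsvl x0, #1 - returns the streaming (SME) vector length in bytes.
  asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
  printf("SVE vector length: %d bytes\n", sve_vl);
  printf("SME vector length: %d bytes\n", sme_vl);
  return 0;
}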
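
The RVV length read is likewise a single CSR access. Here is a standalone sketch using the same raw encoding as the patch (the .word form of csrr t0, vlenb, so the assembler does not need to accept the V extension); it assumes the hart actually implements RVV, which the patch checks first via TestCpuFlag(kCpuHasRVV):

// Minimal sketch: read the RVV VLENB CSR (0xC22) on a RISC-V hart with the
// V extension. No runtime detection is done here.
#include <stdint.h>
#include <stdio.h>

int main(void) {
  // Pin the output to t0 because the hand-encoded csrr below writes t0.
  register uint32_t vlenb __asm__("t0");
  __asm__(".word 0xC22022F3" /* csrr t0, vlenb */ : "=r"(vlenb));
  printf("RVV vector length: %d bytes\n", (int)vlenb);
  return 0;
}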
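
The family/model decode that cpu_test.cc now shares with util/cpuid.c follows the CPUID leaf 1 layout documented in the comments above. For reference, a standalone sketch of the same decode; it uses the GCC/Clang <cpuid.h> helper __get_cpuid() instead of libyuv's CpuId() wrapper, which is an assumption about the build environment rather than part of the patch:

// Minimal sketch: print the CPU vendor string and the family/model values
// derived from CPUID leaf 1 (stepping 3:0, model 7:4, family 11:8,
// extended model 19:16, extended family 27:20), mirroring the patch.
#include <cpuid.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  char vendor[13];

  // Leaf 0: the 12-byte vendor string is returned in EBX, EDX, ECX.
  if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx)) {
    return 1;  // CPUID not available.
  }
  memcpy(vendor + 0, &ebx, 4);
  memcpy(vendor + 4, &edx, 4);
  memcpy(vendor + 8, &ecx, 4);
  vendor[12] = '\0';
  printf("Cpu Vendor: %s\n", vendor);

  // Leaf 1: same bit extraction as the patch.
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    return 1;
  }
  int family = ((eax >> 8) & 0x0f) | ((eax >> 16) & 0xff0);
  int model = ((eax >> 4) & 0x0f) | ((eax >> 12) & 0xf0);
  printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, model);
  return 0;
}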