cpuid: show vector length on ARM and RISC-V

- additional asm volatile changes from GitHub (see the note after this summary)
- rotate mips: remove C function (moved to common)
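
The vector-length reads added below use raw instruction encodings, apparently so the
files still assemble even when the assembler lacks SVE/SME or vector CSR support. A
minimal standalone sketch of the same approach (the PrintVectorLengths name is
hypothetical; the real code lives in the TestCpuHas unit test and the cpuid utility,
guarded by TestCpuFlag() so the instructions only execute on hardware that reports
the feature):

  #include <stdint.h>
  #include <stdio.h>

  static void PrintVectorLengths(void) {
  #if defined(__aarch64__)
    // Assumes SVE/SME are present; the real code checks TestCpuFlag() first.
    int sve_vl, sme_vl;
    // rdvl x0, #1 - SVE vector length in bytes.
    asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
    // rdsvl x0, #1 - SME streaming vector length in bytes.
    asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
    printf("SVE vector length: %d bytes\n", sve_vl);
    printf("SME vector length: %d bytes\n", sme_vl);
  #elif defined(__riscv)
    // Assumes RVV is present; the real code checks TestCpuFlag() first.
    // csrr t0, vlenb - RVV register width in bytes.
    register uint32_t vlenb __asm__("t0");
    __asm__(".word 0xC22022F3" : "=r"(vlenb));
    printf("RVV vector length: %d bytes\n", (int)vlenb);
  #endif
  }

On the S22 run below this reports 16 bytes (128-bit SVE vectors); the emulated X280
reports 64 bytes (VLEN of 512 bits).

On the asm volatile bullet: the x86 row and scale kernels below previously opened
with asm( or with asm volatile (LABELALIGN before the first "1:" label. In GCC/Clang
extended asm, a non-volatile statement whose outputs are unused may be deleted or
moved by the optimizer, so kernels that write memory through their operands are
safer marked volatile; that is presumably the motivation for the blanket
asm -> asm volatile change, with LABELALIGN no longer placed on the opening line.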

Run on Samsung S22:
[ RUN      ] LibYUVBaseTest.TestCpuHas
Kernel Version 5.10
Has Arm 0x2
Has Neon 0x4
Has Neon DotProd 0x10
Has Neon I8MM 0x20
Has SVE 0x40
Has SVE2 0x80
Has SME 0x0
SVE vector length: 16 bytes
[       OK ] LibYUVBaseTest.TestCpuHas (0 ms)
[ RUN      ] LibYUVBaseTest.TestCompilerMacros
__ATOMIC_RELAXED 0
__cplusplus 201703
__clang_major__ 17
__clang_minor__ 0
__GNUC__ 4
__GNUC_MINOR__ 2
__aarch64__ 1
__clang__ 1
__llvm__ 1
__pic__ 2
INT_TYPES_DEFINED
__has_feature

Run on RISC-V QEMU emulating SiFive X280:
[ RUN      ] LibYUVBaseTest.TestCpuHas
Kernel Version 6.6
Has RISCV 0x10000000
Has RVV 0x20000000
RVV vector length: 64 bytes
[       OK ] LibYUVBaseTest.TestCpuHas (4 ms)
[ RUN      ] LibYUVBaseTest.TestCompilerMacros
__ATOMIC_RELAXED 0
__cplusplus 202002
__clang_major__ 9999
__clang_minor__ 0
__GNUC__ 4
__GNUC_MINOR__ 2
__riscv 1
__riscv_vector 1
__riscv_v_intrinsic 12000
__riscv_zve64x 1000000
__clang__ 1
__llvm__ 1
__pic__ 2
INT_TYPES_DEFINED
__has_feature

Bug: b/42280943
Change-Id: I53cf0450be4965a28942e113e4c77295ace70999
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5672088
Reviewed-by: David Gao <davidgao@google.com>
Author: Frank Barchard, 2024-07-02 10:39:04 -07:00
parent 616bee5420
commit fa16ddbb9f
7 changed files with 373 additions and 213 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1889
Version: 1890
License: BSD
License File: LICENSE
Shipped: yes


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1889
#define LIBYUV_VERSION 1890
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -51,16 +51,6 @@ extern "C" {
out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
}
void TransposeWx16_C(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
TransposeWx8_C(src, src_stride, dst, dst_stride, width);
TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
width);
}
void TransposeUVWx16_C(const uint8_t* src,
int src_stride,
uint8_t* dst_a,


@ -1090,7 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
uint16_t* dst_ar64,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@ -1137,7 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
uint8_t* dst_argb,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1186,7 +1186,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
uint16_t* dst_ar64,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
@ -1239,7 +1239,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
uint8_t* dst_argb,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@ -5428,7 +5428,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -5502,7 +5502,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%1),%%xmm1 \n"
@ -5598,7 +5598,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movq (%2),%%xmm0 \n" // B
@ -5776,7 +5776,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
@ -6350,7 +6350,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
#ifdef HAS_COPYROW_AVX
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@ -6451,7 +6451,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0), %%xmm0 \n"
"movdqu 0x10(%0), %%xmm1 \n"
@ -6642,7 +6642,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -6741,7 +6741,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
}
void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -6867,7 +6867,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@ -6970,7 +6970,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
}
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"


@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm(
asm volatile (
// 16 pixel loop.
LABELALIGN
"1: \n"
@ -123,7 +123,8 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm("pcmpeqb %%xmm4,%%xmm4 \n"
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -153,7 +154,8 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("pcmpeqb %%xmm4,%%xmm4 \n"
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -193,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@ -219,7 +221,8 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@ -251,7 +254,8 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@ -293,7 +297,8 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm("pcmpeqb %%xmm5,%%xmm5 \n"
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n"
@ -323,7 +328,8 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stridex3;
asm("pcmpeqb %%xmm4,%%xmm4 \n"
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"packuswb %%xmm4,%%xmm4 \n"
@ -377,7 +383,8 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
asm volatile (
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
@ -409,7 +416,8 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -464,7 +472,8 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm("movdqa %0,%%xmm3 \n"
asm volatile (
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
@ -472,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n"
@ -499,7 +508,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("movdqa %0,%%xmm2 \n" // kShuf01
asm volatile (
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
@ -507,7 +517,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm("movdqa %0,%%xmm5 \n" // kMadd01
asm volatile (
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
@ -515,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -561,7 +572,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("movdqa %0,%%xmm2 \n" // kShuf01
asm volatile (
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
@ -569,7 +581,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm("movdqa %0,%%xmm5 \n" // kMadd01
asm volatile (
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
@ -578,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -628,7 +641,8 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm("movdqa %3,%%xmm4 \n"
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
@ -657,7 +671,8 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("movdqa %0,%%xmm2 \n"
asm volatile (
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"movdqa %3,%%xmm5 \n"
@ -667,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -699,7 +714,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("movdqa %0,%%xmm2 \n"
asm volatile (
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -708,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -766,7 +782,8 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm("pxor %%xmm0,%%xmm0 \n" // 0
asm volatile (
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $1,%%xmm6 \n" // all 2
@ -821,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
@ -934,7 +951,8 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm("movdqa %3,%%xmm5 \n"
asm volatile (
"movdqa %3,%%xmm5 \n"
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
@ -985,7 +1003,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("pcmpeqw %%xmm7,%%xmm7 \n"
asm volatile (
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8
"movdqa %5,%%xmm6 \n"
@ -1082,7 +1101,8 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm("pxor %%xmm5,%%xmm5 \n"
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
"pslld $1,%%xmm4 \n" // all 2
@ -1134,7 +1154,8 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("pxor %%xmm7,%%xmm7 \n"
asm volatile (
"pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
"pslld $3,%%xmm6 \n" // all 8
@ -1241,7 +1262,8 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm("pcmpeqw %%xmm4,%%xmm4 \n"
asm volatile (
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqa %3,%%xmm3 \n"
@ -1281,7 +1303,8 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("pcmpeqw %%xmm6,%%xmm6 \n"
asm volatile (
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqa %5,%%xmm7 \n"
@ -1365,7 +1388,8 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
asm volatile (
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
@ -1408,7 +1432,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
asm volatile (
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
@ -1489,7 +1514,8 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm("vbroadcastf128 %3,%%ymm5 \n"
asm volatile (
"vbroadcastf128 %3,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@ -1540,7 +1566,8 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("vbroadcastf128 %5,%%ymm5 \n"
asm volatile (
"vbroadcastf128 %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
@ -1601,7 +1628,8 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
asm volatile (
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -1650,7 +1678,8 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
asm volatile (
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
@ -1732,7 +1761,8 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm("pxor %%xmm5,%%xmm5 \n"
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
@ -1763,7 +1793,8 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
asm volatile (
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
@ -1804,7 +1835,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
asm("movd %6,%%xmm2 \n"
asm volatile (
"movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
@ -1900,7 +1932,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) {
(void)x;
(void)dx;
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -1925,7 +1957,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1947,7 +1979,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1971,7 +2003,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2005,7 +2037,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
asm("lea 0x00(,%1,4),%1 \n"
asm volatile (
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
@ -2041,7 +2074,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm("lea 0x00(,%1,4),%1 \n"
asm volatile (
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
"lea 0x00(%0,%5,1),%5 \n"
@ -2083,7 +2117,8 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm("movd %5,%%xmm2 \n"
asm volatile (
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"pshufd $0x11,%%xmm3,%%xmm0 \n"
@ -2153,7 +2188,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) {
(void)x;
(void)dx;
asm volatile (LABELALIGN
asm volatile (
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2191,14 +2226,16 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm("movdqa %0,%%xmm4 \n"
asm volatile (
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm("movd %5,%%xmm2 \n"
asm volatile (
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n"
@ -2260,7 +2297,8 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm("cdq \n"
asm volatile (
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
@ -2273,7 +2311,8 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm("cdq \n"
asm volatile (
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
@ -2304,7 +2343,8 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5, %%xmm5 \n" // zero
@ -2343,7 +2383,8 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
asm volatile (
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
@ -2386,7 +2427,8 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm("pcmpeqw %%xmm4,%%xmm4 \n"
asm volatile (
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqa %3,%%xmm3 \n"
@ -2426,7 +2468,8 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("pcmpeqw %%xmm6,%%xmm6 \n"
asm volatile (
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqa %5,%%xmm7 \n"
@ -2509,7 +2552,8 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
asm volatile (
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
@ -2551,7 +2595,8 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
asm volatile (
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
@ -2630,7 +2675,8 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm("pxor %%xmm5,%%xmm5 \n"
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
"pslld $1,%%xmm4 \n" // all 2
@ -2681,7 +2727,8 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("pxor %%xmm7,%%xmm7 \n"
asm volatile (
"pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
"pslld $3,%%xmm6 \n" // all 8
@ -2771,7 +2818,8 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
asm volatile (
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -2819,7 +2867,8 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
asm volatile (
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8


@ -11,6 +11,11 @@
#include <stdlib.h>
#include <string.h>
#ifdef __linux__
#include <ctype.h>
#include <sys/utsname.h>
#endif
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
@ -18,80 +23,179 @@
namespace libyuv {
#ifdef __linux__
static void KernelVersion(int *version) {
struct utsname buffer;
int i = 0;
version[0] = version[1] = 0;
if (uname(&buffer) == 0) {
char *v = buffer.release;
for (i = 0; *v && i < 2; ++v) {
if (isdigit(*v)) {
version[i++] = (int) strtol(v, &v, 10);
}
}
}
}
#endif
TEST_F(LibYUVBaseTest, TestCpuHas) {
int cpu_flags = TestCpuFlag(-1);
printf("Cpu Flags 0x%x\n", cpu_flags);
#if defined(__linux__)
{
int kernelversion[2];
KernelVersion(kernelversion);
printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
}
#endif // defined(__linux__)
#if defined(__arm__) || defined(__aarch64__)
int has_arm = TestCpuFlag(kCpuHasARM);
printf("Has ARM 0x%x\n", has_arm);
int has_neon = TestCpuFlag(kCpuHasNEON);
printf("Has NEON 0x%x\n", has_neon);
#endif
#if defined(__riscv) && defined(__linux__)
if (has_arm) {
int has_neon = TestCpuFlag(kCpuHasNEON);
int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
int has_sve = TestCpuFlag(kCpuHasSVE);
int has_sve2 = TestCpuFlag(kCpuHasSVE2);
int has_sme = TestCpuFlag(kCpuHasSME);
printf("Has Arm 0x%x\n", has_arm);
printf("Has Neon 0x%x\n", has_neon);
printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
printf("Has SVE 0x%x\n", has_sve);
printf("Has SVE2 0x%x\n", has_sve2);
printf("Has SME 0x%x\n", has_sme);
#if defined(__aarch64__)
// Read and print the SVE and SME vector lengths.
if (has_sve) {
int sve_vl;
// rdvl x0, #1
asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
printf("SVE vector length: %d bytes\n", sve_vl);
}
if (has_sme) {
int sme_vl;
// rdsvl x0, #1
asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
printf("SME vector length: %d bytes\n", sme_vl);
}
#endif // defined(__aarch64__)
}
#endif // if defined(__arm__) || defined(__aarch64__)
#if defined(__riscv)
int has_riscv = TestCpuFlag(kCpuHasRISCV);
printf("Has RISCV 0x%x\n", has_riscv);
int has_rvv = TestCpuFlag(kCpuHasRVV);
printf("Has RVV 0x%x\n", has_rvv);
int has_rvvzvfh = TestCpuFlag(kCpuHasRVVZVFH);
printf("Has RVVZVFH 0x%x\n", has_rvvzvfh);
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
defined(_M_X64)
int has_x86 = TestCpuFlag(kCpuHasX86);
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
int has_sse41 = TestCpuFlag(kCpuHasSSE41);
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
int has_avx = TestCpuFlag(kCpuHasAVX);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
int has_f16c = TestCpuFlag(kCpuHasF16C);
int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
int has_avx10 = TestCpuFlag(kCpuHasAVX10);
int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
printf("Has X86 0x%x\n", has_x86);
printf("Has SSE2 0x%x\n", has_sse2);
printf("Has SSSE3 0x%x\n", has_ssse3);
printf("Has SSE41 0x%x\n", has_sse41);
printf("Has SSE42 0x%x\n", has_sse42);
printf("Has AVX 0x%x\n", has_avx);
printf("Has AVX2 0x%x\n", has_avx2);
printf("Has ERMS 0x%x\n", has_erms);
printf("Has FMA3 0x%x\n", has_fma3);
printf("Has F16C 0x%x\n", has_f16c);
printf("Has AVX512BW 0x%x\n", has_avx512bw);
printf("Has AVX512VL 0x%x\n", has_avx512vl);
printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
printf("Has AVX10 0x%x\n", has_avx10);
printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
printf("Has AMXINT8 0x%x\n", has_amxint8);
#endif
if (has_riscv) {
int has_rvv = TestCpuFlag(kCpuHasRVV);
printf("Has RISCV 0x%x\n", has_riscv);
printf("Has RVV 0x%x\n", has_rvv);
// Read and print the RVV vector length.
if (has_rvv) {
register uint32_t vlenb __asm__ ("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
printf("RVV vector length: %d bytes\n", vlenb);
}
}
#endif // defined(__riscv)
#if defined(__mips__)
int has_mips = TestCpuFlag(kCpuHasMIPS);
printf("Has MIPS 0x%x\n", has_mips);
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MSA 0x%x\n", has_msa);
#endif
if (has_mips) {
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MIPS 0x%x\n", has_mips);
printf("Has MSA 0x%x\n", has_msa);
}
#endif // defined(__mips__)
#if defined(__loongarch__)
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
printf("Has LOONGARCH 0x%x\n", has_loongarch);
int has_lsx = TestCpuFlag(kCpuHasLSX);
printf("Has LSX 0x%x\n", has_lsx);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LASX 0x%x\n", has_lasx);
#endif
if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LOONGARCH 0x%x\n", has_loongarch);
printf("Has LSX 0x%x\n", has_lsx);
printf("Has LASX 0x%x\n", has_lasx);
}
#endif // defined(__loongarch__)
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
int family, model, cpu_info[4];
// Vendor ID:
// AuthenticAMD AMD processor
// CentaurHauls Centaur processor
// CyrixInstead Cyrix processor
// GenuineIntel Intel processor
// GenuineTMx86 Transmeta processor
// Geode by NSC National Semiconductor processor
// NexGenDriven NexGen processor
// RiseRiseRise Rise Technology processor
// SiS SiS SiS SiS processor
// UMC UMC UMC UMC processor
CpuId(0, 0, &cpu_info[0]);
cpu_info[0] = cpu_info[1]; // Reorder output
cpu_info[1] = cpu_info[3];
cpu_info[3] = 0;
printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
// CPU Family and Model
// 3:0 - Stepping
// 7:4 - Model
// 11:8 - Family
// 13:12 - Processor Type
// 19:16 - Extended Model
// 27:20 - Extended Family
CpuId(1, 0, &cpu_info[0]);
family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
model, model);
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
int has_sse41 = TestCpuFlag(kCpuHasSSE41);
int has_sse42 = TestCpuFlag(kCpuHasSSE42);
int has_avx = TestCpuFlag(kCpuHasAVX);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
int has_f16c = TestCpuFlag(kCpuHasF16C);
int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL);
int has_avx512vnni = TestCpuFlag(kCpuHasAVX512VNNI);
int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI);
int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2);
int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG);
int has_avx10 = TestCpuFlag(kCpuHasAVX10);
int has_avxvnni = TestCpuFlag(kCpuHasAVXVNNI);
int has_avxvnniint8 = TestCpuFlag(kCpuHasAVXVNNIINT8);
int has_amxint8 = TestCpuFlag(kCpuHasAMXINT8);
printf("Has X86 0x%x\n", has_x86);
printf("Has SSE2 0x%x\n", has_sse2);
printf("Has SSSE3 0x%x\n", has_ssse3);
printf("Has SSE4.1 0x%x\n", has_sse41);
printf("Has SSE4.2 0x%x\n", has_sse42);
printf("Has AVX 0x%x\n", has_avx);
printf("Has AVX2 0x%x\n", has_avx2);
printf("Has ERMS 0x%x\n", has_erms);
printf("Has FMA3 0x%x\n", has_fma3);
printf("Has F16C 0x%x\n", has_f16c);
printf("Has AVX512BW 0x%x\n", has_avx512bw);
printf("Has AVX512VL 0x%x\n", has_avx512vl);
printf("Has AVX512VNNI 0x%x\n", has_avx512vnni);
printf("Has AVX512VBMI 0x%x\n", has_avx512vbmi);
printf("Has AVX512VBMI2 0x%x\n", has_avx512vbmi2);
printf("Has AVX512VBITALG 0x%x\n", has_avx512vbitalg);
printf("Has AVX10 0x%x\n", has_avx10);
printf("HAS AVXVNNI 0x%x\n", has_avxvnni);
printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
printf("Has AMXINT8 0x%x\n", has_amxint8);
}
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
}
TEST_F(LibYUVBaseTest, TestCompilerMacros) {


@ -41,24 +41,91 @@ static void KernelVersion(int *version) {
#endif
int main(int argc, const char* argv[]) {
int cpu_flags = TestCpuFlag(-1);
int has_arm = TestCpuFlag(kCpuHasARM);
int has_riscv = TestCpuFlag(kCpuHasRISCV);
int has_x86 = TestCpuFlag(kCpuHasX86);
int has_mips = TestCpuFlag(kCpuHasMIPS);
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
(void)argc;
(void)argv;
#ifdef __linux__
#if defined(__linux__)
{
int kernelversion[2];
KernelVersion(kernelversion);
printf("Kernel Version %d.%d\n", kernelversion[0], kernelversion[1]);
}
#endif
#endif // defined(__linux__)
#if defined(__arm__) || defined(__aarch64__)
int has_arm = TestCpuFlag(kCpuHasARM);
if (has_arm) {
int has_neon = TestCpuFlag(kCpuHasNEON);
int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
int has_sve = TestCpuFlag(kCpuHasSVE);
int has_sve2 = TestCpuFlag(kCpuHasSVE2);
int has_sme = TestCpuFlag(kCpuHasSME);
printf("Has Arm 0x%x\n", has_arm);
printf("Has Neon 0x%x\n", has_neon);
printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
printf("Has SVE 0x%x\n", has_sve);
printf("Has SVE2 0x%x\n", has_sve2);
printf("Has SME 0x%x\n", has_sme);
#if defined(__aarch64__)
// Read and print the SVE and SME vector lengths.
if (has_sve) {
int sve_vl;
// rdvl x0, #1
asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
printf("SVE vector length: %d bytes\n", sve_vl);
}
if (has_sme) {
int sme_vl;
// rdsvl x0, #1
asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
printf("SME vector length: %d bytes\n", sme_vl);
}
#endif // defined(__aarch64__)
}
#endif // if defined(__arm__) || defined(__aarch64__)
#if defined(__riscv)
int has_riscv = TestCpuFlag(kCpuHasRISCV);
if (has_riscv) {
int has_rvv = TestCpuFlag(kCpuHasRVV);
printf("Has RISCV 0x%x\n", has_riscv);
printf("Has RVV 0x%x\n", has_rvv);
// Read and print the RVV vector length.
if (has_rvv) {
register uint32_t vlenb __asm__ ("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
printf("RVV vector length: %d bytes\n", vlenb);
}
}
#endif // defined(__riscv)
#if defined(__mips__)
int has_mips = TestCpuFlag(kCpuHasMIPS);
if (has_mips) {
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MIPS 0x%x\n", has_mips);
printf("Has MSA 0x%x\n", has_msa);
}
#endif // defined(__mips__)
#if defined(__loongarch__)
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LOONGARCH 0x%x\n", has_loongarch);
printf("Has LSX 0x%x\n", has_lsx);
printf("Has LASX 0x%x\n", has_lasx);
}
#endif // defined(__loongarch__)
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
int family, model, cpu_info[4];
// Vendor ID:
@ -90,58 +157,7 @@ int main(int argc, const char* argv[]) {
model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
model, model);
}
#endif
printf("Cpu Flags 0x%x\n", cpu_flags);
if (has_arm) {
int has_neon = TestCpuFlag(kCpuHasNEON);
int has_neon_dotprod = TestCpuFlag(kCpuHasNeonDotProd);
int has_neon_i8mm = TestCpuFlag(kCpuHasNeonI8MM);
int has_sve = TestCpuFlag(kCpuHasSVE);
int has_sve2 = TestCpuFlag(kCpuHasSVE2);
int has_sme = TestCpuFlag(kCpuHasSME);
printf("Has Arm 0x%x\n", has_arm);
printf("Has Neon 0x%x\n", has_neon);
printf("Has Neon DotProd 0x%x\n", has_neon_dotprod);
printf("Has Neon I8MM 0x%x\n", has_neon_i8mm);
printf("Has SVE 0x%x\n", has_sve);
printf("Has SVE2 0x%x\n", has_sve2);
printf("Has SME 0x%x\n", has_sme);
#if __aarch64__
// Read and print the SVE and SME vector lengths.
if (has_sve) {
int sve_vl;
// rdvl x0, #1
asm(".inst 0x04bf5020; mov %w0, w0" : "=r"(sve_vl)::"x0");
printf("SVE vector length: %d bytes\n", sve_vl);
}
if (has_sme) {
int sme_vl;
// rdsvl x0, #1
asm(".inst 0x04bf5820; mov %w0, w0" : "=r"(sme_vl)::"x0");
printf("SME vector length: %d bytes\n", sme_vl);
}
#endif
}
if (has_riscv) {
int has_rvv = TestCpuFlag(kCpuHasRVV);
printf("Has RISCV 0x%x\n", has_riscv);
printf("Has RVV 0x%x\n", has_rvv);
}
if (has_mips) {
int has_msa = TestCpuFlag(kCpuHasMSA);
printf("Has MIPS 0x%x\n", has_mips);
printf("Has MSA 0x%x\n", has_msa);
}
if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LOONGARCH 0x%x\n", has_loongarch);
printf("Has LSX 0x%x\n", has_lsx);
printf("Has LASX 0x%x\n", has_lasx);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
int has_sse41 = TestCpuFlag(kCpuHasSSE41);
@ -182,6 +198,7 @@ int main(int argc, const char* argv[]) {
printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
printf("Has AMXINT8 0x%x\n", has_amxint8);
}
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
return 0;
}