diff --git a/README.chromium b/README.chromium index 0fa68a010..3f160122a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1897 +Version: 1898 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index 8a295658d..722ec3276 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -23,49 +23,50 @@ static const int kCpuInitialized = 0x1; // These flags are only valid on Arm processors. static const int kCpuHasARM = 0x2; -static const int kCpuHasNEON = 0x4; -// Leave a gap to avoid setting kCpuHasX86. -static const int kCpuHasNeonDotProd = 0x10; -static const int kCpuHasNeonI8MM = 0x20; -static const int kCpuHasSVE = 0x40; -static const int kCpuHasSVE2 = 0x80; -static const int kCpuHasSME = 0x100; - -// These flags are only valid on x86 processors. -static const int kCpuHasX86 = 0x8; -static const int kCpuHasSSE2 = 0x10; -static const int kCpuHasSSSE3 = 0x20; -static const int kCpuHasSSE41 = 0x40; -static const int kCpuHasSSE42 = 0x80; -static const int kCpuHasAVX = 0x100; -static const int kCpuHasAVX2 = 0x200; -static const int kCpuHasERMS = 0x400; -static const int kCpuHasFMA3 = 0x800; -static const int kCpuHasF16C = 0x1000; -static const int kCpuHasAVX512BW = 0x2000; -static const int kCpuHasAVX512VL = 0x4000; -static const int kCpuHasAVX512VNNI = 0x8000; -static const int kCpuHasAVX512VBMI = 0x10000; -static const int kCpuHasAVX512VBMI2 = 0x20000; -static const int kCpuHasAVX512VBITALG = 0x40000; -static const int kCpuHasAVX10 = 0x80000; -static const int kCpuHasAVXVNNI = 0x100000; -static const int kCpuHasAVXVNNIINT8 = 0x200000; -static const int kCpuHasAMXINT8 = 0x400000; - -// These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x800000; -static const int kCpuHasMSA = 0x1000000; - -// These flags are only valid on LOONGARCH processors. 
-static const int kCpuHasLOONGARCH = 0x2000000; -static const int kCpuHasLSX = 0x4000000; -static const int kCpuHasLASX = 0x8000000; +static const int kCpuHasNEON = 0x100; +static const int kCpuHasNeonDotProd = 0x200; +static const int kCpuHasNeonI8MM = 0x400; +static const int kCpuHasSVE = 0x800; +static const int kCpuHasSVE2 = 0x1000; +static const int kCpuHasSME = 0x2000; // These flags are only valid on RISCV processors. -static const int kCpuHasRISCV = 0x10000000; -static const int kCpuHasRVV = 0x20000000; -static const int kCpuHasRVVZVFH = 0x40000000; +static const int kCpuHasRISCV = 0x8; +static const int kCpuHasRVV = 0x100; +static const int kCpuHasRVVZVFH = 0x200; + +// These flags are only valid on x86 processors. +static const int kCpuHasX86 = 0x10; +static const int kCpuHasSSE2 = 0x100; +static const int kCpuHasSSSE3 = 0x200; +static const int kCpuHasSSE41 = 0x400; +static const int kCpuHasSSE42 = 0x800; +static const int kCpuHasAVX = 0x1000; +static const int kCpuHasAVX2 = 0x2000; +static const int kCpuHasERMS = 0x4000; +static const int kCpuHasFSMR = 0x8000; +static const int kCpuHasFMA3 = 0x10000; +static const int kCpuHasF16C = 0x20000; +static const int kCpuHasAVX512BW = 0x40000; +static const int kCpuHasAVX512VL = 0x80000; +static const int kCpuHasAVX512VNNI = 0x100000; +static const int kCpuHasAVX512VBMI = 0x200000; +static const int kCpuHasAVX512VBMI2 = 0x400000; +static const int kCpuHasAVX512VBITALG = 0x800000; +static const int kCpuHasAVX10 = 0x1000000; +static const int kCpuHasAVXVNNI = 0x2000000; +static const int kCpuHasAVXVNNIINT8 = 0x4000000; +static const int kCpuHasAMXINT8 = 0x8000000; + +// These flags are only valid on MIPS processors. +static const int kCpuHasMIPS = 0x20; +static const int kCpuHasMSA = 0x100; + +// These flags are only valid on LOONGARCH processors. +static const int kCpuHasLOONGARCH = 0x40; +static const int kCpuHasLSX = 0x100; +static const int kCpuHasLASX = 0x200; + // Optional init function. 
TestCpuFlag does an auto-init. // Returns cpu_info flags. diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 673ec20d2..70f89134c 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -355,6 +355,7 @@ extern "C" { // TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) +#define HAS_COPYROW_AVX512BW #define HAS_ARGBTORGB24ROW_AVX512VBMI #define HAS_CONVERT16TO8ROW_AVX512BW #define HAS_MERGEUVROW_AVX512BW @@ -3406,6 +3407,7 @@ void Convert16To8Row_Any_NEON(const uint16_t* src_ptr, void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); @@ -3413,6 +3415,7 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 23d3ad67a..1e01bac5a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1897 +#define LIBYUV_VERSION 1898 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/cpu_id.cc b/source/cpu_id.cc index cd0112af4..e4d59a052 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc 
@@ -419,7 +419,8 @@ static SAFEBUFFERS int GetCpuFlags(void) { ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info7[3] & 0x00000010) ? kCpuHasFSMR : 0); // AVX requires OS saves YMM registers. if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave @@ -432,14 +433,14 @@ static SAFEBUFFERS int GetCpuFlags(void) { // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; - cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; - cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; - cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; - cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0; - cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; - cpu_info |= (cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0; - cpu_info |= (cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0; + cpu_info |= ((cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0) | + ((cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0) | + ((cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0) | + ((cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0) | + ((cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0) | + ((cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0) | + ((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) | + ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0); } } #endif diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 87f0ff2a0..be67a1ded 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -62,6 +62,11 @@ void CopyPlane(const uint8_t* src_y, CopyRow = IS_ALIGNED(width, 64) ? 
CopyRow_AVX : CopyRow_Any_AVX; } #endif +#if defined(HAS_COPYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + CopyRow = IS_ALIGNED(width, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/rotate.cc b/source/rotate.cc index 08ec2ccfb..6b0b84f59 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -234,6 +234,11 @@ void RotatePlane180(const uint8_t* src, CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif +#if defined(HAS_COPYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + CopyRow = IS_ALIGNED(width, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index d55fac4f6..7cfaedc52 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -189,6 +189,11 @@ static int ARGBRotate180(const uint8_t* src_argb, CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif +#if defined(HAS_COPYROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + CopyRow = IS_ALIGNED(width * 4, 128) ? 
CopyRow_AVX512BW : CopyRow_Any_AVX512BW; + } +#endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; diff --git a/source/row_any.cc b/source/row_any.cc index cd8b03f83..a61ab817c 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -967,6 +967,9 @@ ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) memcpy(dst_ptr + n * BPP, vout, r * BPP); \ } +#ifdef HAS_COPYROW_AVX512BW +ANY11(CopyRow_Any_AVX512BW, CopyRow_AVX512BW, 0, 1, 1, 127) +#endif #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 1b72ab524..2ec59759f 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -3361,7 +3361,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \ "vpermq $0xd8,%%zmm3,%%zmm3 \n" \ "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \ - "vmovdqu8 (%[y_buf]),%%ymm4 \n" \ + "vmovups (%[y_buf]),%%ymm4 \n" \ "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \ "vpermq $0xd8,%%zmm4,%%zmm4 \n" \ "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \ @@ -3572,17 +3572,17 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpbroadcastq %%xmm8, %%zmm8 \n" \ "vpsllw $7,%%xmm13,%%xmm13 \n" \ "vpbroadcastb %%xmm13,%%zmm13 \n" \ - "movq 32(%[yuvconstants]),%%xmm9 \n" \ + "movq 32(%[yuvconstants]),%%xmm9 \n" \ "vpbroadcastq %%xmm9,%%zmm9 \n" \ - "movq 64(%[yuvconstants]),%%xmm10 \n" \ + "movq 64(%[yuvconstants]),%%xmm10 \n" \ "vpbroadcastq %%xmm10,%%zmm10 \n" \ - "movq 96(%[yuvconstants]),%%xmm11 \n" \ + "movq 96(%[yuvconstants]),%%xmm11 \n" \ "vpbroadcastq %%xmm11,%%zmm11 \n" \ - "movq 128(%[yuvconstants]),%%xmm12 \n" \ + "movq 128(%[yuvconstants]),%%xmm12 \n" \ "vpbroadcastq %%xmm12,%%zmm12 \n" \ - "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \ - "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \ - "vmovdqu8 (%[unperm]),%%zmm18 \n" + "vmovups (%[quadsplitperm]),%%zmm16 \n" \ + "vmovups (%[dquadsplitperm]),%%zmm17 \n" \ + "vmovups (%[unperm]),%%zmm18 \n" 
#define YUVTORGB16_AVX2(yuvconstants) \ "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \ @@ -3672,8 +3672,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \ "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \ "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \ - "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \ - "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \ + "vmovups %%zmm1,(%[dst_argb]) \n" \ + "vmovups %%zmm0,0x40(%[dst_argb]) \n" \ "lea 0x80(%[dst_argb]), %[dst_argb] \n" // Store 16 AR30 values. @@ -5340,15 +5340,15 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, // 64 pixels per loop. LABELALIGN "1: \n" - "vmovdqu8 (%0),%%zmm0 \n" - "vmovdqu8 0x40(%0),%%zmm1 \n" + "vmovups (%0),%%zmm0 \n" + "vmovups 0x40(%0),%%zmm1 \n" "add $0x80,%0 \n" "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n" "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n" "vpmovuswb %%zmm0,%%ymm0 \n" "vpmovuswb %%zmm1,%%ymm1 \n" - "vmovdqu8 %%ymm0,(%1) \n" - "vmovdqu8 %%ymm1,0x20(%1) \n" + "vmovups %%ymm0,(%1) \n" + "vmovups %%ymm1,0x20(%1) \n" "add $0x40,%1 \n" "sub $0x40,%2 \n" "jg 1b \n" @@ -5504,17 +5504,20 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, #endif // HAS_SPLITRGBROW_SSSE3 #ifdef HAS_SPLITRGBROW_SSE41 -// Shuffle table for converting RGB to Planar, SSE4.1. -alignas(16) static const uvec8 kSplitRGBShuffleSSE41[4] = { - {1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u}, +// Shuffle table for converting RGB to Planar, SSE4.1. Note: these are used for +// the AVX2 implementation as well. 
+static const uvec8 kSplitRGBShuffleSSE41[5] = { {0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u}, {1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u}, - {2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u}}; + {2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u}, + {0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u}, + {0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u}, +}; void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { asm volatile( - "movdqa 0(%5), %%xmm0 \n" + "movdqa 48(%5), %%xmm0 \n" "1: \n" "movdqu (%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -5524,14 +5527,14 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r, "pblendvb %%xmm3, %%xmm1 \n" "pblendvb %%xmm2, %%xmm3 \n" "pblendvb %%xmm4, %%xmm2 \n" - "psrlq $0x1, %%xmm0 \n" + "palignr $0xF, %%xmm0, %%xmm0 \n" "pblendvb %%xmm2, %%xmm1 \n" "pblendvb %%xmm3, %%xmm2 \n" "pblendvb %%xmm4, %%xmm3 \n" - "psllq $0x1, %%xmm0 \n" - "pshufb 16(%5), %%xmm1 \n" - "pshufb 32(%5), %%xmm2 \n" - "pshufb 48(%5), %%xmm3 \n" + "palignr $0x1, %%xmm0, %%xmm0 \n" + "pshufb 0(%5), %%xmm1 \n" + "pshufb 16(%5), %%xmm2 \n" + "pshufb 32(%5), %%xmm3 \n" "movdqu %%xmm1,(%1) \n" "lea 0x10(%1),%1 \n" "movdqu %%xmm2,(%2) \n" @@ -5540,11 +5543,11 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r, "lea 0x10(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_r), // %1 - "+r"(dst_g), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 : "r"(&kSplitRGBShuffleSSE41[0]) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } @@ -5554,8 +5557,13 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r, void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* 
dst_b, int width) { asm volatile( - "vbroadcasti128 0(%5), %%ymm0 \n" - "vpsrlq $0x1,%%ymm0,%%ymm7 \n" + "vbroadcasti128 48(%5), %%ymm0 \n" + "vbroadcasti128 64(%5), %%ymm7 \n" +#if defined(__x86_64__) + "vbroadcasti128 0(%5), %%ymm8 \n" + "vbroadcasti128 16(%5), %%ymm9 \n" + "vbroadcasti128 32(%5), %%ymm10 \n" +#endif "1: \n" "vmovdqu (%0),%%ymm4 \n" "vmovdqu 0x20(%0),%%ymm5 \n" @@ -5570,12 +5578,18 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, "vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n" "vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n" "vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n" - "vbroadcasti128 16(%5), %%ymm4 \n" - "vbroadcasti128 32(%5), %%ymm5 \n" - "vbroadcasti128 48(%5), %%ymm6 \n" +#if defined(__x86_64__) + "vpshufb %%ymm8, %%ymm1, %%ymm1 \n" + "vpshufb %%ymm9, %%ymm2, %%ymm2 \n" + "vpshufb %%ymm10, %%ymm3, %%ymm3 \n" +#else + "vbroadcasti128 0(%5), %%ymm4 \n" + "vbroadcasti128 16(%5), %%ymm5 \n" + "vbroadcasti128 32(%5), %%ymm6 \n" "vpshufb %%ymm4, %%ymm1, %%ymm1 \n" "vpshufb %%ymm5, %%ymm2, %%ymm2 \n" "vpshufb %%ymm6, %%ymm3, %%ymm3 \n" +#endif "vmovdqu %%ymm1,(%1) \n" "lea 0x20(%1),%1 \n" "vmovdqu %%ymm2,(%2) \n" @@ -5591,7 +5605,11 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, "+r"(width) // %4 : "r"(&kSplitRGBShuffleSSE41[0]) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + "xmm7" +#if defined(__x86_64__) + , "xmm8", "xmm9", "xmm10" +#endif + ); } #endif // HAS_SPLITRGBROW_AVX2 @@ -6487,6 +6505,27 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_COPYROW_AVX +#ifdef HAS_COPYROW_AVX512BW +void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { + asm volatile ( + "1: \n" + "vmovups (%0),%%zmm0 \n" + "vmovups 0x40(%0),%%zmm1 \n" + "lea 0x80(%0),%0 \n" + "vmovups %%zmm0,(%1) \n" + "vmovups %%zmm1,0x40(%1) \n" + "lea 0x80(%1),%1 \n" + "sub $0x80,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + 
"+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_AVX512 + #ifdef HAS_COPYROW_ERMS // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { diff --git a/source/scale_sme.cc b/source/scale_sme.cc index f4166a5b1..fd364b316 100644 --- a/source/scale_sme.cc +++ b/source/scale_sme.cc @@ -15,6 +15,7 @@ namespace libyuv { extern "C" { #endif + #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ defined(__aarch64__) diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 25ec85760..427614420 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -24,6 +24,48 @@ namespace libyuv { +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) +TEST_F(LibYUVBaseTest, TestCpuId) { + int has_x86 = TestCpuFlag(kCpuHasX86); + if (has_x86) { + int cpu_info[4]; + // Vendor ID: + // AuthenticAMD AMD processor + // CentaurHauls Centaur processor + // CyrixInstead Cyrix processor + // GenuineIntel Intel processor + // GenuineTMx86 Transmeta processor + // Geode by NSC National Semiconductor processor + // NexGenDriven NexGen processor + // RiseRiseRise Rise Technology processor + // SiS SiS SiS SiS processor + // UMC UMC UMC UMC processor + CpuId(0, 0, cpu_info); + cpu_info[0] = cpu_info[1]; // Reorder output + cpu_info[1] = cpu_info[3]; + cpu_info[3] = 0; + printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n", + reinterpret_cast(&cpu_info[0]), cpu_info[0], cpu_info[1], + cpu_info[2]); + EXPECT_EQ(12u, strlen(reinterpret_cast(&cpu_info[0]))); + + // CPU Family and Model + // 3:0 - Stepping + // 7:4 - Model + // 11:8 - Family + // 13:12 - Processor Type + // 19:16 - Extended Model + // 27:20 - Extended Family + CpuId(1, 0, cpu_info); + int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); + int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); + printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, + model); + } +} +#endif 
+ #ifdef __linux__ static void KernelVersion(int *version) { struct utsname buffer; @@ -131,37 +173,6 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { defined(_M_IX86) || defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { - int family, model, cpu_info[4]; - // Vendor ID: - // AuthenticAMD AMD processor - // CentaurHauls Centaur processor - // CyrixInstead Cyrix processor - // GenuineIntel Intel processor - // GenuineTMx86 Transmeta processor - // Geode by NSC National Semiconductor processor - // NexGenDriven NexGen processor - // RiseRiseRise Rise Technology processor - // SiS SiS SiS SiS processor - // UMC UMC UMC UMC processor - CpuId(0, 0, &cpu_info[0]); - cpu_info[0] = cpu_info[1]; // Reorder output - cpu_info[1] = cpu_info[3]; - cpu_info[3] = 0; - printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0])); - - // CPU Family and Model - // 3:0 - Stepping - // 7:4 - Model - // 11:8 - Family - // 13:12 - Processor Type - // 19:16 - Extended Model - // 27:20 - Extended Family - CpuId(1, 0, &cpu_info[0]); - family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); - model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); - printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, - model, model); - int has_sse2 = TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); int has_sse41 = TestCpuFlag(kCpuHasSSE41); @@ -169,6 +180,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); + int has_fsmr = TestCpuFlag(kCpuHasFSMR); int has_fma3 = TestCpuFlag(kCpuHasFMA3); int has_f16c = TestCpuFlag(kCpuHasF16C); int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); @@ -189,6 +201,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { printf("Has AVX 0x%x\n", has_avx); printf("Has AVX2 0x%x\n", has_avx2); printf("Has ERMS 0x%x\n", has_erms); + printf("Has FSMR 0x%x\n", has_fsmr); printf("Has FMA3 0x%x\n", has_fma3); printf("Has F16C 0x%x\n", 
has_f16c); printf("Has AVX512BW 0x%x\n", has_avx512bw); @@ -315,48 +328,6 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) { #endif } -#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ - defined(_M_X64) -TEST_F(LibYUVBaseTest, TestCpuId) { - int has_x86 = TestCpuFlag(kCpuHasX86); - if (has_x86) { - int cpu_info[4]; - // Vendor ID: - // AuthenticAMD AMD processor - // CentaurHauls Centaur processor - // CyrixInstead Cyrix processor - // GenuineIntel Intel processor - // GenuineTMx86 Transmeta processor - // Geode by NSC National Semiconductor processor - // NexGenDriven NexGen processor - // RiseRiseRise Rise Technology processor - // SiS SiS SiS SiS processor - // UMC UMC UMC UMC processor - CpuId(0, 0, cpu_info); - cpu_info[0] = cpu_info[1]; // Reorder output - cpu_info[1] = cpu_info[3]; - cpu_info[3] = 0; - printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n", - reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1], - cpu_info[2]); - EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0]))); - - // CPU Family and Model - // 3:0 - Stepping - // 7:4 - Model - // 11:8 - Family - // 13:12 - Processor Type - // 19:16 - Extended Model - // 27:20 - Extended Family - CpuId(1, 0, cpu_info); - int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); - int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); - printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, - model); - } -} -#endif - static int FileExists(const char* file_name) { FILE* f = fopen(file_name, "r"); if (!f) { diff --git a/util/cpuid.c b/util/cpuid.c index 93d95673d..766c43c19 100644 --- a/util/cpuid.c +++ b/util/cpuid.c @@ -171,6 +171,7 @@ int main(int argc, const char* argv[]) { int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); + int has_fsmr = TestCpuFlag(kCpuHasFSMR); int has_fma3 = TestCpuFlag(kCpuHasFMA3); int has_f16c = TestCpuFlag(kCpuHasF16C); int has_avx512bw = 
TestCpuFlag(kCpuHasAVX512BW);