CpuId test FSMR - Fast Short Rep Movsb

- Renumber cpuid bits so the low byte identifies the CPU architecture and the upper 24 bits carry that architecture's feature flags
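
(Illustration only, not part of the change: with the new layout the low byte holds one architecture ID bit - e.g. kCpuHasX86 = 0x10 - and values from 0x100 up are per-architecture feature bits, so kCpuHasSSE2, kCpuHasNEON and kCpuHasRVV can all reuse 0x100. A minimal caller checks the architecture bit before its feature bits, using the new constants from cpu_id.h below:)

#include <stdio.h>
#include "libyuv/cpu_id.h"

int main(void) {
  // Architecture byte first (kCpuHasX86 = 0x10 lives in the low byte) ...
  if (libyuv::TestCpuFlag(libyuv::kCpuHasX86)) {
    // ... then the x86-only feature bits in the upper 24 bits.
    printf("Has ERMS 0x%x\n", libyuv::TestCpuFlag(libyuv::kCpuHasERMS));
    printf("Has FSMR 0x%x\n", libyuv::TestCpuFlag(libyuv::kCpuHasFSMR));
  }
  return 0;
}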

Intel CPUs starting with Ice Lake support FSMR (0x8000 is the new kCpuHasFSMR bit):
adl:Has FSMR 0x8000
arl:Has FSMR 0x0
bdw:Has FSMR 0x0
clx:Has FSMR 0x0
cnl:Has FSMR 0x0
cpx:Has FSMR 0x0
emr:Has FSMR 0x8000
glm:Has FSMR 0x0
glp:Has FSMR 0x0
gnr:Has FSMR 0x8000
gnr256:Has FSMR 0x8000
hsw:Has FSMR 0x0
icl:Has FSMR 0x8000
icx:Has FSMR 0x8000
ivb:Has FSMR 0x0
knl:Has FSMR 0x0
knm:Has FSMR 0x0
lnl:Has FSMR 0x8000
mrm:Has FSMR 0x0
mtl:Has FSMR 0x8000
nhm:Has FSMR 0x0
pnr:Has FSMR 0x0
rpl:Has FSMR 0x8000
skl:Has FSMR 0x0
skx:Has FSMR 0x0
slm:Has FSMR 0x0
slt:Has FSMR 0x0
snb:Has FSMR 0x0
snr:Has FSMR 0x0
spr:Has FSMR 0x8000
srf:Has FSMR 0x0
tgl:Has FSMR 0x8000
tnt:Has FSMR 0x0
wsm:Has FSMR 0x0

Intel CPUs starting with Ivy Bridge support ERMS (0x4000 is the new kCpuHasERMS bit):

adl:Has ERMS 0x4000
arl:Has ERMS 0x4000
bdw:Has ERMS 0x4000
clx:Has ERMS 0x4000
cnl:Has ERMS 0x4000
cpx:Has ERMS 0x4000
emr:Has ERMS 0x4000
glm:Has ERMS 0x4000
glp:Has ERMS 0x4000
gnr:Has ERMS 0x4000
gnr256:Has ERMS 0x4000
hsw:Has ERMS 0x4000
icl:Has ERMS 0x4000
icx:Has ERMS 0x4000
ivb:Has ERMS 0x4000
knl:Has ERMS 0x4000
knm:Has ERMS 0x4000
lnl:Has ERMS 0x4000
mrm:Has ERMS 0x0
mtl:Has ERMS 0x4000
nhm:Has ERMS 0x0
pnr:Has ERMS 0x0
rpl:Has ERMS 0x4000
skl:Has ERMS 0x4000
skx:Has ERMS 0x4000
slm:Has ERMS 0x4000
slt:Has ERMS 0x0
snb:Has ERMS 0x0
snr:Has ERMS 0x4000
spr:Has ERMS 0x4000
srf:Has ERMS 0x4000
tgl:Has ERMS 0x4000
tnt:Has ERMS 0x4000
wsm:Has ERMS 0x0
Change-Id: I18e5a3905f2691ab66d4d0cb6f668c0a0ff72d37
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6027541
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2024-11-16 15:40:17 -08:00
parent 75f7cfdde5
commit 1c501a8f3f
13 changed files with 195 additions and 160 deletions


@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1897
+Version: 1898
 License: BSD
 License File: LICENSE
 Shipped: yes


@@ -23,49 +23,50 @@ static const int kCpuInitialized = 0x1;
 // These flags are only valid on Arm processors.
 static const int kCpuHasARM = 0x2;
-static const int kCpuHasNEON = 0x4;
-// Leave a gap to avoid setting kCpuHasX86.
-static const int kCpuHasNeonDotProd = 0x10;
-static const int kCpuHasNeonI8MM = 0x20;
-static const int kCpuHasSVE = 0x40;
-static const int kCpuHasSVE2 = 0x80;
-static const int kCpuHasSME = 0x100;
-// These flags are only valid on x86 processors.
-static const int kCpuHasX86 = 0x8;
-static const int kCpuHasSSE2 = 0x10;
-static const int kCpuHasSSSE3 = 0x20;
-static const int kCpuHasSSE41 = 0x40;
-static const int kCpuHasSSE42 = 0x80;
-static const int kCpuHasAVX = 0x100;
-static const int kCpuHasAVX2 = 0x200;
-static const int kCpuHasERMS = 0x400;
-static const int kCpuHasFMA3 = 0x800;
-static const int kCpuHasF16C = 0x1000;
-static const int kCpuHasAVX512BW = 0x2000;
-static const int kCpuHasAVX512VL = 0x4000;
-static const int kCpuHasAVX512VNNI = 0x8000;
-static const int kCpuHasAVX512VBMI = 0x10000;
-static const int kCpuHasAVX512VBMI2 = 0x20000;
-static const int kCpuHasAVX512VBITALG = 0x40000;
-static const int kCpuHasAVX10 = 0x80000;
-static const int kCpuHasAVXVNNI = 0x100000;
-static const int kCpuHasAVXVNNIINT8 = 0x200000;
-static const int kCpuHasAMXINT8 = 0x400000;
-// These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x800000;
-static const int kCpuHasMSA = 0x1000000;
-// These flags are only valid on LOONGARCH processors.
-static const int kCpuHasLOONGARCH = 0x2000000;
-static const int kCpuHasLSX = 0x4000000;
-static const int kCpuHasLASX = 0x8000000;
+static const int kCpuHasNEON = 0x100;
+static const int kCpuHasNeonDotProd = 0x200;
+static const int kCpuHasNeonI8MM = 0x400;
+static const int kCpuHasSVE = 0x800;
+static const int kCpuHasSVE2 = 0x1000;
+static const int kCpuHasSME = 0x2000;
 // These flags are only valid on RISCV processors.
-static const int kCpuHasRISCV = 0x10000000;
-static const int kCpuHasRVV = 0x20000000;
-static const int kCpuHasRVVZVFH = 0x40000000;
+static const int kCpuHasRISCV = 0x8;
+static const int kCpuHasRVV = 0x100;
+static const int kCpuHasRVVZVFH = 0x200;
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x100;
+static const int kCpuHasSSSE3 = 0x200;
+static const int kCpuHasSSE41 = 0x400;
+static const int kCpuHasSSE42 = 0x800;
+static const int kCpuHasAVX = 0x1000;
+static const int kCpuHasAVX2 = 0x2000;
+static const int kCpuHasERMS = 0x4000;
+static const int kCpuHasFSMR = 0x8000;
+static const int kCpuHasFMA3 = 0x10000;
+static const int kCpuHasF16C = 0x20000;
+static const int kCpuHasAVX512BW = 0x40000;
+static const int kCpuHasAVX512VL = 0x80000;
+static const int kCpuHasAVX512VNNI = 0x100000;
+static const int kCpuHasAVX512VBMI = 0x200000;
+static const int kCpuHasAVX512VBMI2 = 0x400000;
+static const int kCpuHasAVX512VBITALG = 0x800000;
+static const int kCpuHasAVX10 = 0x1000000;
+static const int kCpuHasAVXVNNI = 0x2000000;
+static const int kCpuHasAVXVNNIINT8 = 0x4000000;
+static const int kCpuHasAMXINT8 = 0x8000000;
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x20;
+static const int kCpuHasMSA = 0x100;
+// These flags are only valid on LOONGARCH processors.
+static const int kCpuHasLOONGARCH = 0x40;
+static const int kCpuHasLSX = 0x100;
+static const int kCpuHasLASX = 0x200;
 // Optional init function. TestCpuFlag does an auto-init.
 // Returns cpu_info flags.


@@ -355,6 +355,7 @@ extern "C" {
 // TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
+#define HAS_COPYROW_AVX512BW
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_CONVERT16TO8ROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
@@ -3406,6 +3407,7 @@ void Convert16To8Row_Any_NEON(const uint16_t* src_ptr,
 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
@@ -3413,6 +3415,7 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);


@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1897
+#define LIBYUV_VERSION 1898
 #endif  // INCLUDE_LIBYUV_VERSION_H_


@@ -419,7 +419,8 @@ static SAFEBUFFERS int GetCpuFlags(void) {
       ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
       ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
       ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-      ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
+      ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+      ((cpu_info7[3] & 0x00000010) ? kCpuHasFSMR : 0);
 
   // AVX requires OS saves YMM registers.
   if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
@@ -432,14 +433,14 @@ static SAFEBUFFERS int GetCpuFlags(void) {
     // Detect AVX512bw
     if ((GetXCR0() & 0xe0) == 0xe0) {
-      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
-      cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
-      cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
-      cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
-      cpu_info |= (cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0;
-      cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
-      cpu_info |= (cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0;
-      cpu_info |= (cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0;
+      cpu_info |= ((cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0) |
+                  ((cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0) |
+                  ((cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0) |
+                  ((cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0) |
+                  ((cpu_info7[2] & 0x00000800) ? kCpuHasAVX512VNNI : 0) |
+                  ((cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0) |
+                  ((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
+                  ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
     }
   }
 
 #endif
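
For reference, the two new masks above test CPUID leaf 7, subleaf 0: ERMS is EBX bit 9 (0x00000200) and FSMR (documented by Intel as FSRM, Fast Short REP MOVSB) is EDX bit 4 (0x00000010). A standalone sketch of the same check using the compiler-provided <cpuid.h> helper rather than libyuv's CpuId wrapper:

#include <cpuid.h>
#include <stdio.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
    printf("ERMS: %u\n", (ebx >> 9) & 1);  // Enhanced REP MOVSB/STOSB
    printf("FSRM: %u\n", (edx >> 4) & 1);  // Fast Short REP MOVSB
  }
  return 0;
}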


@@ -62,6 +62,11 @@ void CopyPlane(const uint8_t* src_y,
     CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
+#if defined(HAS_COPYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    CopyRow = IS_ALIGNED(width, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
+  }
+#endif
 #if defined(HAS_COPYROW_ERMS)
   if (TestCpuFlag(kCpuHasERMS)) {
     CopyRow = CopyRow_ERMS;


@@ -234,6 +234,11 @@ void RotatePlane180(const uint8_t* src,
     CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
+#if defined(HAS_COPYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    CopyRow = IS_ALIGNED(width, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
+  }
+#endif
 #if defined(HAS_COPYROW_ERMS)
   if (TestCpuFlag(kCpuHasERMS)) {
     CopyRow = CopyRow_ERMS;


@@ -189,6 +189,11 @@ static int ARGBRotate180(const uint8_t* src_argb,
     CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
   }
 #endif
+#if defined(HAS_COPYROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    CopyRow = IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
+  }
+#endif
 #if defined(HAS_COPYROW_ERMS)
   if (TestCpuFlag(kCpuHasERMS)) {
     CopyRow = CopyRow_ERMS;


@@ -967,6 +967,9 @@ ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
     memcpy(dst_ptr + n * BPP, vout, r * BPP);                        \
   }
 
+#ifdef HAS_COPYROW_AVX512BW
+ANY11(CopyRow_Any_AVX512BW, CopyRow_AVX512BW, 0, 1, 1, 127)
+#endif
 #ifdef HAS_COPYROW_AVX
 ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
 #endif
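
The trailing 127 in the new ANY11 line is the width mask: CopyRow_AVX512BW handles 128 bytes per iteration, so the generated CopyRow_Any_AVX512BW runs the kernel on the largest multiple of 128 and finishes the remainder through the temp-buffer memcpy visible in the macro above. Roughly, the wrapper behaves like this simplified sketch (the _sketch name and the plain tail memcpy are illustrative; the real macro pads the tail into a temp buffer and runs the SIMD kernel on it as well):

#include <stdint.h>
#include <string.h>

// Kernel declared in row.h; requires a multiple of 128 bytes.
void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);

void CopyRow_Any_AVX512BW_sketch(const uint8_t* src, uint8_t* dst, int width) {
  int r = width & 127;  // leftover bytes
  int n = width - r;    // largest multiple of 128
  if (n > 0) {
    CopyRow_AVX512BW(src, dst, n);
  }
  memcpy(dst + n, src + n, r);  // remainder
}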


@@ -3361,7 +3361,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpcklbw %%zmm1,%%zmm3,%%zmm3 \n" \
   "vpermq $0xd8,%%zmm3,%%zmm3 \n" \
   "vpunpcklwd %%zmm3,%%zmm3,%%zmm3 \n" \
-  "vmovdqu8 (%[y_buf]),%%ymm4 \n" \
+  "vmovups (%[y_buf]),%%ymm4 \n" \
   "vpermq %%zmm4,%%zmm17,%%zmm4 \n" \
   "vpermq $0xd8,%%zmm4,%%zmm4 \n" \
   "vpunpcklbw %%zmm4,%%zmm4,%%zmm4 \n" \
@@ -3580,9 +3580,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpbroadcastq %%xmm11,%%zmm11 \n" \
   "movq 128(%[yuvconstants]),%%xmm12 \n" \
   "vpbroadcastq %%xmm12,%%zmm12 \n" \
-  "vmovdqu8 (%[quadsplitperm]),%%zmm16 \n" \
-  "vmovdqu8 (%[dquadsplitperm]),%%zmm17 \n" \
-  "vmovdqu8 (%[unperm]),%%zmm18 \n"
+  "vmovups (%[quadsplitperm]),%%zmm16 \n" \
+  "vmovups (%[dquadsplitperm]),%%zmm17 \n" \
+  "vmovups (%[unperm]),%%zmm18 \n"
 
 #define YUVTORGB16_AVX2(yuvconstants) \
   "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
@@ -3672,8 +3672,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \
   "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \
   "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \
-  "vmovdqu8 %%zmm1,(%[dst_argb]) \n" \
-  "vmovdqu8 %%zmm0,0x40(%[dst_argb]) \n" \
+  "vmovups %%zmm1,(%[dst_argb]) \n" \
+  "vmovups %%zmm0,0x40(%[dst_argb]) \n" \
   "lea 0x80(%[dst_argb]), %[dst_argb] \n"
 
 // Store 16 AR30 values.
@@ -5340,15 +5340,15 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y,
       // 64 pixels per loop.
       LABELALIGN
       "1: \n"
-      "vmovdqu8 (%0),%%zmm0 \n"
-      "vmovdqu8 0x40(%0),%%zmm1 \n"
+      "vmovups (%0),%%zmm0 \n"
+      "vmovups 0x40(%0),%%zmm1 \n"
       "add $0x80,%0 \n"
       "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
       "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
       "vpmovuswb %%zmm0,%%ymm0 \n"
       "vpmovuswb %%zmm1,%%ymm1 \n"
-      "vmovdqu8 %%ymm0,(%1) \n"
-      "vmovdqu8 %%ymm1,0x20(%1) \n"
+      "vmovups %%ymm0,(%1) \n"
+      "vmovups %%ymm1,0x20(%1) \n"
       "add $0x40,%1 \n"
       "sub $0x40,%2 \n"
       "jg 1b \n"
@@ -5504,17 +5504,20 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
 #endif  // HAS_SPLITRGBROW_SSSE3
 
 #ifdef HAS_SPLITRGBROW_SSE41
-// Shuffle table for converting RGB to Planar, SSE4.1.
-alignas(16) static const uvec8 kSplitRGBShuffleSSE41[4] = {
-    {1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u, 128u, 0u, 1u},
+// Shuffle table for converting RGB to Planar, SSE4.1. Note: these are used for
+// the AVX2 implementation as well.
+static const uvec8 kSplitRGBShuffleSSE41[5] = {
     {0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u},
     {1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u, 2u, 5u, 8u, 11u, 14u},
-    {2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u}};
+    {2u, 5u, 8u, 11u, 14u, 1u, 4u, 7u, 10u, 13u, 0u, 3u, 6u, 9u, 12u, 15u},
+    {0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u},
+    {0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u},
+};
 
 void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r,
                        uint8_t* dst_g, uint8_t* dst_b, int width) {
   asm volatile(
-      "movdqa 0(%5), %%xmm0 \n"
+      "movdqa 48(%5), %%xmm0 \n"
       "1: \n"
       "movdqu (%0),%%xmm1 \n"
       "movdqu 0x10(%0),%%xmm2 \n"
@@ -5524,14 +5527,14 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r,
       "pblendvb %%xmm3, %%xmm1 \n"
       "pblendvb %%xmm2, %%xmm3 \n"
       "pblendvb %%xmm4, %%xmm2 \n"
-      "psrlq $0x1, %%xmm0 \n"
+      "palignr $0xF, %%xmm0, %%xmm0 \n"
       "pblendvb %%xmm2, %%xmm1 \n"
       "pblendvb %%xmm3, %%xmm2 \n"
       "pblendvb %%xmm4, %%xmm3 \n"
-      "psllq $0x1, %%xmm0 \n"
-      "pshufb 16(%5), %%xmm1 \n"
-      "pshufb 32(%5), %%xmm2 \n"
-      "pshufb 48(%5), %%xmm3 \n"
+      "palignr $0x1, %%xmm0, %%xmm0 \n"
+      "pshufb 0(%5), %%xmm1 \n"
+      "pshufb 16(%5), %%xmm2 \n"
+      "pshufb 32(%5), %%xmm3 \n"
       "movdqu %%xmm1,(%1) \n"
       "lea 0x10(%1),%1 \n"
       "movdqu %%xmm2,(%2) \n"
@@ -5554,8 +5557,13 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r,
 void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r,
                       uint8_t* dst_g, uint8_t* dst_b, int width) {
   asm volatile(
-      "vbroadcasti128 0(%5), %%ymm0 \n"
-      "vpsrlq $0x1,%%ymm0,%%ymm7 \n"
+      "vbroadcasti128 48(%5), %%ymm0 \n"
+      "vbroadcasti128 64(%5), %%ymm7 \n"
+#if defined(__x86_64__)
+      "vbroadcasti128 0(%5), %%ymm8 \n"
+      "vbroadcasti128 16(%5), %%ymm9 \n"
+      "vbroadcasti128 32(%5), %%ymm10 \n"
+#endif
       "1: \n"
       "vmovdqu (%0),%%ymm4 \n"
       "vmovdqu 0x20(%0),%%ymm5 \n"
@@ -5570,12 +5578,18 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r,
       "vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n"
       "vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n"
      "vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n"
-      "vbroadcasti128 16(%5), %%ymm4 \n"
-      "vbroadcasti128 32(%5), %%ymm5 \n"
-      "vbroadcasti128 48(%5), %%ymm6 \n"
+#if defined(__x86_64__)
+      "vpshufb %%ymm8, %%ymm1, %%ymm1 \n"
+      "vpshufb %%ymm9, %%ymm2, %%ymm2 \n"
+      "vpshufb %%ymm10, %%ymm3, %%ymm3 \n"
+#else
+      "vbroadcasti128 0(%5), %%ymm4 \n"
+      "vbroadcasti128 16(%5), %%ymm5 \n"
+      "vbroadcasti128 32(%5), %%ymm6 \n"
       "vpshufb %%ymm4, %%ymm1, %%ymm1 \n"
       "vpshufb %%ymm5, %%ymm2, %%ymm2 \n"
       "vpshufb %%ymm6, %%ymm3, %%ymm3 \n"
+#endif
       "vmovdqu %%ymm1,(%1) \n"
       "lea 0x20(%1),%1 \n"
       "vmovdqu %%ymm2,(%2) \n"
@@ -5591,7 +5605,11 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r,
         "+r"(width)  // %4
       : "r"(&kSplitRGBShuffleSSE41[0])  // %5
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
-        "xmm7");
+        "xmm7"
+#if defined(__x86_64__)
+        , "xmm8", "xmm9", "xmm10"
+#endif
+  );
 }
 #endif  // HAS_SPLITRGBROW_AVX2
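
For orientation on the SplitRGB changes above: the reworked table keeps the three pshufb patterns at offsets 0/16/32 and the two pblendvb masks at 48/64, which is why the SSE4.1 and AVX2 loads moved to the new offsets and the x86-64 AVX2 path can hold all five constants in registers. The result both paths must produce is a plain RGB deinterleave - a scalar reference sketch, equivalent in behavior to libyuv's C fallback:

#include <stdint.h>

// Scalar reference for SplitRGBRow: every third byte goes to each plane.
void SplitRGBRow_C_sketch(const uint8_t* src_rgb,
                          uint8_t* dst_r,
                          uint8_t* dst_g,
                          uint8_t* dst_b,
                          int width) {
  for (int x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[0];
    dst_g[x] = src_rgb[1];
    dst_b[x] = src_rgb[2];
    src_rgb += 3;
  }
}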
@@ -6487,6 +6505,27 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_COPYROW_AVX
 
+#ifdef HAS_COPYROW_AVX512BW
+void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1: \n"
+      "vmovups (%0),%%zmm0 \n"
+      "vmovups 0x40(%0),%%zmm1 \n"
+      "lea 0x80(%0),%0 \n"
+      "vmovups %%zmm0,(%1) \n"
+      "vmovups %%zmm1,0x40(%1) \n"
+      "lea 0x80(%1),%1 \n"
+      "sub $0x80,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+#endif  // HAS_COPYROW_AVX512BW
+
 #ifdef HAS_COPYROW_ERMS
 // Multiple of 1.
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
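
Note that this change only detects and reports kCpuHasFSMR; the copy kernels still dispatch on kCpuHasERMS as before. For context, an ERMS-style row copy is a bare rep movsb, which FSMR/FSRM-capable parts additionally make fast for short lengths. A hedged sketch of what CopyRow_ERMS amounts to:

#include <stddef.h>
#include <stdint.h>

// rep movsb copies RCX bytes from RSI to RDI; ERMS makes this fast for
// large copies, FSRM extends that to short copies as well.
static void CopyRow_RepMovsb(const uint8_t* src, uint8_t* dst, int width) {
  size_t count = (size_t)width;
  asm volatile("rep movsb"
               : "+S"(src), "+D"(dst), "+c"(count)
               :
               : "memory");
}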


@@ -15,6 +15,7 @@ namespace libyuv {
 extern "C" {
 #endif
 
 #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
     defined(__aarch64__)


@@ -24,6 +24,48 @@
 namespace libyuv {
 
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+    defined(_M_X64)
+TEST_F(LibYUVBaseTest, TestCpuId) {
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  if (has_x86) {
+    int cpu_info[4];
+    // Vendor ID:
+    // AuthenticAMD AMD processor
+    // CentaurHauls Centaur processor
+    // CyrixInstead Cyrix processor
+    // GenuineIntel Intel processor
+    // GenuineTMx86 Transmeta processor
+    // Geode by NSC National Semiconductor processor
+    // NexGenDriven NexGen processor
+    // RiseRiseRise Rise Technology processor
+    // SiS SiS SiS SiS processor
+    // UMC UMC UMC UMC processor
+    CpuId(0, 0, cpu_info);
+    cpu_info[0] = cpu_info[1];  // Reorder output
+    cpu_info[1] = cpu_info[3];
+    cpu_info[3] = 0;
+    printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n",
+           reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1],
+           cpu_info[2]);
+    EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+    // CPU Family and Model
+    // 3:0 - Stepping
+    // 7:4 - Model
+    // 11:8 - Family
+    // 13:12 - Processor Type
+    // 19:16 - Extended Model
+    // 27:20 - Extended Family
+    CpuId(1, 0, cpu_info);
+    int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+    int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
+           model);
+  }
+}
+#endif
+
 #ifdef __linux__
 static void KernelVersion(int *version) {
   struct utsname buffer;
@@ -131,37 +173,6 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
     defined(_M_IX86) || defined(_M_X64)
   int has_x86 = TestCpuFlag(kCpuHasX86);
   if (has_x86) {
-    int family, model, cpu_info[4];
-    // Vendor ID:
-    // AuthenticAMD AMD processor
-    // CentaurHauls Centaur processor
-    // CyrixInstead Cyrix processor
-    // GenuineIntel Intel processor
-    // GenuineTMx86 Transmeta processor
-    // Geode by NSC National Semiconductor processor
-    // NexGenDriven NexGen processor
-    // RiseRiseRise Rise Technology processor
-    // SiS SiS SiS SiS processor
-    // UMC UMC UMC UMC processor
-    CpuId(0, 0, &cpu_info[0]);
-    cpu_info[0] = cpu_info[1];  // Reorder output
-    cpu_info[1] = cpu_info[3];
-    cpu_info[3] = 0;
-    printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
-    // CPU Family and Model
-    // 3:0 - Stepping
-    // 7:4 - Model
-    // 11:8 - Family
-    // 13:12 - Processor Type
-    // 19:16 - Extended Model
-    // 27:20 - Extended Family
-    CpuId(1, 0, &cpu_info[0]);
-    family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
-    model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
-    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
-           model, model);
-
     int has_sse2 = TestCpuFlag(kCpuHasSSE2);
     int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
     int has_sse41 = TestCpuFlag(kCpuHasSSE41);
@@ -169,6 +180,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
     int has_avx = TestCpuFlag(kCpuHasAVX);
     int has_avx2 = TestCpuFlag(kCpuHasAVX2);
     int has_erms = TestCpuFlag(kCpuHasERMS);
+    int has_fsmr = TestCpuFlag(kCpuHasFSMR);
     int has_fma3 = TestCpuFlag(kCpuHasFMA3);
     int has_f16c = TestCpuFlag(kCpuHasF16C);
     int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);
@@ -189,6 +201,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
     printf("Has AVX 0x%x\n", has_avx);
     printf("Has AVX2 0x%x\n", has_avx2);
     printf("Has ERMS 0x%x\n", has_erms);
+    printf("Has FSMR 0x%x\n", has_fsmr);
     printf("Has FMA3 0x%x\n", has_fma3);
     printf("Has F16C 0x%x\n", has_f16c);
     printf("Has AVX512BW 0x%x\n", has_avx512bw);
@@ -315,48 +328,6 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) {
 #endif
 }
 
-#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
-    defined(_M_X64)
-TEST_F(LibYUVBaseTest, TestCpuId) {
-  int has_x86 = TestCpuFlag(kCpuHasX86);
-  if (has_x86) {
-    int cpu_info[4];
-    // Vendor ID:
-    // AuthenticAMD AMD processor
-    // CentaurHauls Centaur processor
-    // CyrixInstead Cyrix processor
-    // GenuineIntel Intel processor
-    // GenuineTMx86 Transmeta processor
-    // Geode by NSC National Semiconductor processor
-    // NexGenDriven NexGen processor
-    // RiseRiseRise Rise Technology processor
-    // SiS SiS SiS SiS processor
-    // UMC UMC UMC UMC processor
-    CpuId(0, 0, cpu_info);
-    cpu_info[0] = cpu_info[1];  // Reorder output
-    cpu_info[1] = cpu_info[3];
-    cpu_info[3] = 0;
-    printf("Cpu Vendor: %s 0x%x 0x%x 0x%x\n",
-           reinterpret_cast<char*>(&cpu_info[0]), cpu_info[0], cpu_info[1],
-           cpu_info[2]);
-    EXPECT_EQ(12u, strlen(reinterpret_cast<char*>(&cpu_info[0])));
-    // CPU Family and Model
-    // 3:0 - Stepping
-    // 7:4 - Model
-    // 11:8 - Family
-    // 13:12 - Processor Type
-    // 19:16 - Extended Model
-    // 27:20 - Extended Family
-    CpuId(1, 0, cpu_info);
-    int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
-    int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
-    printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model,
-           model);
-  }
-}
-#endif
-
 static int FileExists(const char* file_name) {
   FILE* f = fopen(file_name, "r");
   if (!f) {


@@ -171,6 +171,7 @@ int main(int argc, const char* argv[]) {
   int has_avx = TestCpuFlag(kCpuHasAVX);
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
   int has_erms = TestCpuFlag(kCpuHasERMS);
+  int has_fsmr = TestCpuFlag(kCpuHasFSMR);
   int has_fma3 = TestCpuFlag(kCpuHasFMA3);
   int has_f16c = TestCpuFlag(kCpuHasF16C);
   int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW);