From e35422d94bda549997db4fc0f9b6836b325e1888 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 21 Oct 2013 18:10:59 +0000 Subject: [PATCH] Fix AVX2 detect and a performance stall for gcc/clang. BUG=276 TEST=Cpu unittest R=nfullagar@google.com, ryanpetrie@google.com Review URL: https://webrtc-codereview.appspot.com/2401004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@817 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/cpu_id.h | 4 +- include/libyuv/version.h | 2 +- source/cpu_id.cc | 104 ++++++++++++++------------------------- unit_test/cpu_test.cc | 6 +-- util/cpuid.c | 6 +-- 6 files changed, 48 insertions(+), 76 deletions(-) diff --git a/README.chromium b/README.chromium index d162d537e..3e6a3bdaa 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 814 +Version: 817 License: BSD License File: LICENSE diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index 53adca474..79da994c7 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -68,8 +68,10 @@ LIBYUV_API void MaskCpuFlags(int enable_flags); // Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. 
LIBYUV_API -void CpuId(int cpu_info[4], int info_type); +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index fed1414eb..8e9096da5 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 814 +#define LIBYUV_VERSION 817 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 46f4bd1f1..fb15f2084 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -11,7 +11,7 @@ #include "libyuv/cpu_id.h" #ifdef _MSC_VER -#include <intrin.h> // For __cpuid() +#include <intrin.h> // For __cpuidex() #endif #if !defined(__CLR_VER) && !defined(__native_client__) && defined(_M_X64) && \ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) @@ -28,28 +28,6 @@ #include "libyuv/basic_types.h" // For CPU_X86 -// TODO(fbarchard): Consider cpu functionality for breakpoints, timer and cache. -// arm - bkpt vs intel int 3 - -// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. 
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); -} -#elif defined(__i386__) || defined(__x86_64__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); -} -#endif - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -59,52 +37,44 @@ extern "C" { #if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \ defined(__i386__) || defined(__x86_64__)) LIBYUV_API -void CpuId(int cpu_info[4], int info_type) { - __cpuid(cpu_info, info_type); +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { +#if defined(_MSC_VER) + __cpuidex(reinterpret_cast<int*>(cpu_info), eax, ecx); +#else + uint32 ebx, edx; + asm volatile ( // NOLINT +#if defined( __i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D" (ebx), +#else + "cpuid \n" + : "+b" (ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a" (eax), "+c" (ecx), "=d" (edx)); + cpu_info[0] = eax; cpu_info[1] = ebx; cpu_info[2] = ecx; cpu_info[3] = edx; +#endif // defined(_MSC_VER) +} +#define HAS_XGETBV +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int TestOsSaveYmm() { + uint32 xcr0; +#if defined(_MSC_VER) + xcr0 = (uint32)_xgetbv(0); /* min VS2010 SP1 compiler is required */ +#else + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); +#endif + return((xcr0 & 6) == 6); // Is ymm saved? 
} #else LIBYUV_API -void CpuId(int cpu_info[4], int) { +void CpuId(uint32, uint32, uint32* abcd) { cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } #endif -// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -#if !defined(__CLR_VER) && !defined(__native_client__) -#if defined(_M_X64) && defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) -#define HAS_XGETBV -static uint32 XGetBV(unsigned int xcr) { - return static_cast<uint32>(_xgetbv(xcr)); -} -#elif !defined(__CLR_VER) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_XGETBV -__declspec(naked) __declspec(align(16)) -static uint32 XGetBV(unsigned int xcr) { - __asm { - mov ecx, [esp + 4] // xcr - push edx - _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005. - pop edx - ret - } -} -#elif defined(__i386__) || defined(__x86_64__) -#define HAS_XGETBV -static uint32 XGetBV(unsigned int xcr) { - uint32 xcr_feature_mask; - asm volatile ( // NOLINT - ".byte 0x0f, 0x01, 0xd0\n" - : "=a"(xcr_feature_mask) - : "c"(xcr) - : "memory", "cc", "edx"); // edx unused. - return xcr_feature_mask; -} -#endif -#endif // !defined(__CLR_VER) && !defined(__native_client__) -#ifdef HAS_XGETBV -static const int kXCR_XFEATURE_ENABLED_MASK = 0; -#endif - // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU LIBYUV_API @@ -170,10 +140,10 @@ static bool TestEnv(const char*) { LIBYUV_API int InitCpuFlags(void) { #if !defined(__CLR_VER) && defined(CPU_X86) - int cpu_info1[4] = { 0, 0, 0, 0 }; - int cpu_info7[4] = { 0, 0, 0, 0 }; - __cpuid(cpu_info1, 1); - __cpuid(cpu_info7, 7); + uint32 cpu_info1[4] = { 0, 0, 0, 0 }; + uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + CpuId(1, 0, cpu_info1); + CpuId(7, 0, cpu_info7); cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? 
kCpuHasSSE41 : 0) | @@ -183,7 +153,7 @@ int InitCpuFlags(void) { kCpuHasX86; #ifdef HAS_XGETBV if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave - (XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06) { // Saves YMM. + TestOsSaveYmm()) { // Saves YMM. cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; } diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index bcdc17fbe..45579b891 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -56,7 +56,7 @@ TEST_F(libyuvTest, TestCpuHas) { TEST_F(libyuvTest, TestCpuId) { int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { - int cpu_info[4]; + uint32 cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor @@ -68,7 +68,7 @@ TEST_F(libyuvTest, TestCpuId) { // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC processor - CpuId(cpu_info, 0); + CpuId(0, 0, cpu_info); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; @@ -83,7 +83,7 @@ TEST_F(libyuvTest, TestCpuId) { // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family - CpuId(cpu_info, 1); + CpuId(1, 0, cpu_info); int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, diff --git a/util/cpuid.c b/util/cpuid.c index 79c821fd3..db22871ea 100644 --- a/util/cpuid.c +++ b/util/cpuid.c @@ -25,7 +25,7 @@ int main(int argc, const char* argv[]) { #if defined(__i386__) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_X64) if (has_x86) { - int family, model, cpu_info[4]; + uint32 family, model, cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor @@ -37,7 +37,7 @@ int main(int argc, const char* argv[]) { // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC 
processor - CpuId(cpu_info, 0); + CpuId(0, 0, &cpu_info[0]); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; @@ -50,7 +50,7 @@ int main(int argc, const char* argv[]) { // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family - CpuId(cpu_info, 1); + CpuId(1, 0, &cpu_info[0]); family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,