diff --git a/README.chromium b/README.chromium index 41f5f44a6..b2c02eadc 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 789 +Version: 790 License: BSD License File: LICENSE diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index 8b6d04322..53adca474 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -18,6 +18,7 @@ namespace libyuv { extern "C" { #endif +// TODO(fbarchard): Consider overlapping bits for different architectures. // Internal flag to indicate cpuid requires initialization. static const int kCpuInit = 0x1; @@ -35,11 +36,13 @@ static const int kCpuHasSSE42 = 0x100; static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x1000; -static const int kCpuHasMIPS_DSP = 0x2000; -static const int kCpuHasMIPS_DSPR2 = 0x4000; +static const int kCpuHasMIPS = 0x10000; +static const int kCpuHasMIPS_DSP = 0x20000; +static const int kCpuHasMIPS_DSPR2 = 0x40000; // Internal function used to auto-init. LIBYUV_API diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 725e326fd..12655dc69 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 789 +#define LIBYUV_VERSION 790 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/cpu_id.cc b/source/cpu_id.cc index 1e780f02f..46f4bd1f1 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -179,6 +179,7 @@ int InitCpuFlags(void) { ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | kCpuHasX86; #ifdef HAS_XGETBV if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave @@ -212,6 +213,9 @@ int InitCpuFlags(void) { if (TestEnv("LIBYUV_DISABLE_ERMS")) { cpu_info_ &= ~kCpuHasERMS; } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info_ &= ~kCpuHasFMA3; + } #elif defined(__mips__) && defined(__linux__) // Linux mips parse text file for dsp detect. cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 606b199a5..e4a255194 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2057,7 +2057,8 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBPOLYNOMIALROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 2)) { + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { ARGBPolynomialRow = ARGBPolynomialRow_AVX2; } #endif diff --git a/source/row_win.cc b/source/row_win.cc index 21102a600..273c35066 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -6834,10 +6834,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, int width) { __asm { mov eax, [esp + 12] /* poly */ - vmovdqu xmm4, [eax] - vmovdqu xmm5, [eax + 16] - vmovdqu xmm6, [eax + 32] - vmovdqu xmm7, [eax + 48] + vmovdqu xmm4, [eax] // C0 + vmovdqu xmm5, [eax + 16] // C1 + vmovdqu xmm6, [eax + 32] // C2 + vmovdqu xmm7, [eax + 48] // C3 vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords vpermq ymm5, ymm5, 0x44 vpermq ymm6, ymm6, 0x44 @@ -6850,25 +6850,22 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, // 2 pixel loop. align 16 convertloop: - vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels - lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats - vmulps ymm2, ymm0, ymm0 // X * X - vmulps ymm3, ymm0, ymm7 // C3 * X - vmulps ymm1, ymm0, ymm5 // C1 * X - vmulps ymm3, ymm2, ymm3 // C3 * X * X * X - vmulps ymm2, ymm2, ymm6 // C2 * X * X - vaddps ymm1, ymm1, ymm4 // result = C0 + C1 * X - vaddps ymm1, ymm1, ymm3 // result += C3 * X * X * X - vaddps ymm1, ymm1, ymm2 // result += C2 * X * X - vcvttps2dq ymm1, ymm1 - vpackusdw ymm1, ymm1, ymm1 // b0g0r0a0_00000000_b0g0r0a0_00000000 - vpermq ymm1, ymm1, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 - vpackuswb xmm1, xmm1, xmm1 // bgrabgra_00000000_00000000_00000000 - sub ecx, 2 - vmovq qword ptr [edx], xmm1 - lea edx, [edx + 8] - jg convertloop + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + sub ecx, 2 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop vzeroupper ret } diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 67c489cfc..bcdc17fbe 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -41,6 +41,8 @@ TEST_F(libyuvTest, TestCpuHas) { printf("Has AVX2 %x\n", has_avx2); int has_erms = TestCpuFlag(kCpuHasERMS); printf("Has ERMS %x\n", has_erms); + int has_fma3 = TestCpuFlag(kCpuHasFMA3); + printf("Has FMA3 %x\n", has_fma3); int has_mips = TestCpuFlag(kCpuHasMIPS); printf("Has MIPS %x\n", has_mips); int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP); @@ -93,10 +95,8 @@ TEST_F(libyuvTest, TestCpuId) { TEST_F(libyuvTest, TestLinuxNeon) { int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt"); if (testdata) { - EXPECT_EQ(0, - ArmCpuCaps("unit_test/testdata/arm_v7.txt")); - EXPECT_EQ(kCpuHasNEON, - ArmCpuCaps("unit_test/testdata/tegra3.txt")); + EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt")); + EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt")); } else { printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n"); } diff --git a/util/cpuid.c b/util/cpuid.c index 8d8529ba7..79c821fd3 100644 --- a/util/cpuid.c +++ b/util/cpuid.c @@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) { int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); + int has_fma3 = TestCpuFlag(kCpuHasFMA3); printf("Has SSE2 %x\n", has_sse2); printf("Has SSSE3 %x\n", has_ssse3); printf("Has SSE4.1 %x\n", has_sse41); @@ -86,6 +87,7 @@ int main(int argc, const char* argv[]) { printf("Has AVX %x\n", has_avx); printf("Has AVX2 %x\n", has_avx2); printf("Has ERMS %x\n", has_erms); + printf("Has FMA3 %x\n", has_fma3); } return 0; }