Mirror of https://chromium.googlesource.com/libyuv/libyuv (last synced 2026-02-14 22:29:52 +08:00)
FMA3 version of Polynomial
BUG=265
TEST=cpuid and Polynomial unittest
R=changjun.yang@intel.com
Review URL: https://webrtc-codereview.appspot.com/2217004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@790 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
65d1ba6a26
commit
2bbb64df2c
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 789
|
Version: 790
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -18,6 +18,7 @@ namespace libyuv {
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// TODO(fbarchard): Consider overlapping bits for different architectures.
|
||||||
// Internal flag to indicate cpuid requires initialization.
|
// Internal flag to indicate cpuid requires initialization.
|
||||||
static const int kCpuInit = 0x1;
|
static const int kCpuInit = 0x1;
|
||||||
|
|
||||||
@ -35,11 +36,13 @@ static const int kCpuHasSSE42 = 0x100;
|
|||||||
static const int kCpuHasAVX = 0x200;
|
static const int kCpuHasAVX = 0x200;
|
||||||
static const int kCpuHasAVX2 = 0x400;
|
static const int kCpuHasAVX2 = 0x400;
|
||||||
static const int kCpuHasERMS = 0x800;
|
static const int kCpuHasERMS = 0x800;
|
||||||
|
static const int kCpuHasFMA3 = 0x1000;
|
||||||
|
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
|
||||||
|
|
||||||
// These flags are only valid on MIPS processors.
|
// These flags are only valid on MIPS processors.
|
||||||
static const int kCpuHasMIPS = 0x1000;
|
static const int kCpuHasMIPS = 0x10000;
|
||||||
static const int kCpuHasMIPS_DSP = 0x2000;
|
static const int kCpuHasMIPS_DSP = 0x20000;
|
||||||
static const int kCpuHasMIPS_DSPR2 = 0x4000;
|
static const int kCpuHasMIPS_DSPR2 = 0x40000;
|
||||||
|
|
||||||
// Internal function used to auto-init.
|
// Internal function used to auto-init.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 789
|
#define LIBYUV_VERSION 790
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -179,6 +179,7 @@ int InitCpuFlags(void) {
|
|||||||
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
|
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
|
||||||
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
|
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
|
||||||
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
|
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
|
||||||
|
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
|
||||||
kCpuHasX86;
|
kCpuHasX86;
|
||||||
#ifdef HAS_XGETBV
|
#ifdef HAS_XGETBV
|
||||||
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
|
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
|
||||||
@ -212,6 +213,9 @@ int InitCpuFlags(void) {
|
|||||||
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
|
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
|
||||||
cpu_info_ &= ~kCpuHasERMS;
|
cpu_info_ &= ~kCpuHasERMS;
|
||||||
}
|
}
|
||||||
|
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
|
||||||
|
cpu_info_ &= ~kCpuHasFMA3;
|
||||||
|
}
|
||||||
#elif defined(__mips__) && defined(__linux__)
|
#elif defined(__mips__) && defined(__linux__)
|
||||||
// Linux mips parse text file for dsp detect.
|
// Linux mips parse text file for dsp detect.
|
||||||
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
|
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
|
||||||
|
|||||||
@ -2057,7 +2057,8 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
|
#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 2)) {
|
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
|
||||||
|
IS_ALIGNED(width, 2)) {
|
||||||
ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
|
ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -6834,10 +6834,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 12] /* poly */
|
mov eax, [esp + 12] /* poly */
|
||||||
vmovdqu xmm4, [eax]
|
vmovdqu xmm4, [eax] // C0
|
||||||
vmovdqu xmm5, [eax + 16]
|
vmovdqu xmm5, [eax + 16] // C1
|
||||||
vmovdqu xmm6, [eax + 32]
|
vmovdqu xmm6, [eax + 32] // C2
|
||||||
vmovdqu xmm7, [eax + 48]
|
vmovdqu xmm7, [eax + 48] // C3
|
||||||
vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords
|
vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords
|
||||||
vpermq ymm5, ymm5, 0x44
|
vpermq ymm5, ymm5, 0x44
|
||||||
vpermq ymm6, ymm6, 0x44
|
vpermq ymm6, ymm6, 0x44
|
||||||
@ -6850,25 +6850,22 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
|||||||
// 2 pixel loop.
|
// 2 pixel loop.
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
|
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
|
||||||
lea eax, [eax + 8]
|
lea eax, [eax + 8]
|
||||||
vcvtdq2ps ymm0, ymm0 // X 8 floats
|
vcvtdq2ps ymm0, ymm0 // X 8 floats
|
||||||
vmulps ymm2, ymm0, ymm0 // X * X
|
vmulps ymm2, ymm0, ymm0 // X * X
|
||||||
vmulps ymm3, ymm0, ymm7 // C3 * X
|
vmulps ymm3, ymm0, ymm7 // C3 * X
|
||||||
vmulps ymm1, ymm0, ymm5 // C1 * X
|
vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
|
||||||
vmulps ymm3, ymm2, ymm3 // C3 * X * X * X
|
vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
|
||||||
vmulps ymm2, ymm2, ymm6 // C2 * X * X
|
vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
|
||||||
vaddps ymm1, ymm1, ymm4 // result = C0 + C1 * X
|
vcvttps2dq ymm0, ymm0
|
||||||
vaddps ymm1, ymm1, ymm3 // result += C3 * X * X * X
|
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
||||||
vaddps ymm1, ymm1, ymm2 // result += C2 * X * X
|
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
||||||
vcvttps2dq ymm1, ymm1
|
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
|
||||||
vpackusdw ymm1, ymm1, ymm1 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
sub ecx, 2
|
||||||
vpermq ymm1, ymm1, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
vmovq qword ptr [edx], xmm0
|
||||||
vpackuswb xmm1, xmm1, xmm1 // bgrabgra_00000000_00000000_00000000
|
lea edx, [edx + 8]
|
||||||
sub ecx, 2
|
jg convertloop
|
||||||
vmovq qword ptr [edx], xmm1
|
|
||||||
lea edx, [edx + 8]
|
|
||||||
jg convertloop
|
|
||||||
vzeroupper
|
vzeroupper
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
|
|||||||
@ -41,6 +41,8 @@ TEST_F(libyuvTest, TestCpuHas) {
|
|||||||
printf("Has AVX2 %x\n", has_avx2);
|
printf("Has AVX2 %x\n", has_avx2);
|
||||||
int has_erms = TestCpuFlag(kCpuHasERMS);
|
int has_erms = TestCpuFlag(kCpuHasERMS);
|
||||||
printf("Has ERMS %x\n", has_erms);
|
printf("Has ERMS %x\n", has_erms);
|
||||||
|
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
|
||||||
|
printf("Has FMA3 %x\n", has_fma3);
|
||||||
int has_mips = TestCpuFlag(kCpuHasMIPS);
|
int has_mips = TestCpuFlag(kCpuHasMIPS);
|
||||||
printf("Has MIPS %x\n", has_mips);
|
printf("Has MIPS %x\n", has_mips);
|
||||||
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
|
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
|
||||||
@ -93,10 +95,8 @@ TEST_F(libyuvTest, TestCpuId) {
|
|||||||
TEST_F(libyuvTest, TestLinuxNeon) {
|
TEST_F(libyuvTest, TestLinuxNeon) {
|
||||||
int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
|
int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
|
||||||
if (testdata) {
|
if (testdata) {
|
||||||
EXPECT_EQ(0,
|
EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
|
||||||
ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
|
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt"));
|
||||||
EXPECT_EQ(kCpuHasNEON,
|
|
||||||
ArmCpuCaps("unit_test/testdata/tegra3.txt"));
|
|
||||||
} else {
|
} else {
|
||||||
printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
|
printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) {
|
|||||||
int has_avx = TestCpuFlag(kCpuHasAVX);
|
int has_avx = TestCpuFlag(kCpuHasAVX);
|
||||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||||
int has_erms = TestCpuFlag(kCpuHasERMS);
|
int has_erms = TestCpuFlag(kCpuHasERMS);
|
||||||
|
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
|
||||||
printf("Has SSE2 %x\n", has_sse2);
|
printf("Has SSE2 %x\n", has_sse2);
|
||||||
printf("Has SSSE3 %x\n", has_ssse3);
|
printf("Has SSSE3 %x\n", has_ssse3);
|
||||||
printf("Has SSE4.1 %x\n", has_sse41);
|
printf("Has SSE4.1 %x\n", has_sse41);
|
||||||
@ -86,6 +87,7 @@ int main(int argc, const char* argv[]) {
|
|||||||
printf("Has AVX %x\n", has_avx);
|
printf("Has AVX %x\n", has_avx);
|
||||||
printf("Has AVX2 %x\n", has_avx2);
|
printf("Has AVX2 %x\n", has_avx2);
|
||||||
printf("Has ERMS %x\n", has_erms);
|
printf("Has ERMS %x\n", has_erms);
|
||||||
|
printf("Has FMA3 %x\n", has_fma3);
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user