mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
FMA3 version of Polynomial
BUG=265
TEST=cpuid and Polynomial unittest
R=changjun.yang@intel.com
Review URL: https://webrtc-codereview.appspot.com/2217004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@790 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
65d1ba6a26
commit
2bbb64df2c
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 789
|
||||
Version: 790
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// TODO(fbarchard): Consider overlapping bits for different architectures.
|
||||
// Internal flag to indicate cpuid requires initialization.
|
||||
static const int kCpuInit = 0x1;
|
||||
|
||||
@ -35,11 +36,13 @@ static const int kCpuHasSSE42 = 0x100;
|
||||
static const int kCpuHasAVX = 0x200;
|
||||
static const int kCpuHasAVX2 = 0x400;
|
||||
static const int kCpuHasERMS = 0x800;
|
||||
static const int kCpuHasFMA3 = 0x1000;
|
||||
// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
|
||||
|
||||
// These flags are only valid on MIPS processors.
|
||||
static const int kCpuHasMIPS = 0x1000;
|
||||
static const int kCpuHasMIPS_DSP = 0x2000;
|
||||
static const int kCpuHasMIPS_DSPR2 = 0x4000;
|
||||
static const int kCpuHasMIPS = 0x10000;
|
||||
static const int kCpuHasMIPS_DSP = 0x20000;
|
||||
static const int kCpuHasMIPS_DSPR2 = 0x40000;
|
||||
|
||||
// Internal function used to auto-init.
|
||||
LIBYUV_API
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 789
|
||||
#define LIBYUV_VERSION 790
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -179,6 +179,7 @@ int InitCpuFlags(void) {
|
||||
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
|
||||
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
|
||||
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
|
||||
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
|
||||
kCpuHasX86;
|
||||
#ifdef HAS_XGETBV
|
||||
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
|
||||
@ -212,6 +213,9 @@ int InitCpuFlags(void) {
|
||||
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
|
||||
cpu_info_ &= ~kCpuHasERMS;
|
||||
}
|
||||
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
|
||||
cpu_info_ &= ~kCpuHasFMA3;
|
||||
}
|
||||
#elif defined(__mips__) && defined(__linux__)
|
||||
// Linux mips parse text file for dsp detect.
|
||||
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
|
||||
|
||||
@ -2057,7 +2057,8 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 2)) {
|
||||
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
|
||||
IS_ALIGNED(width, 2)) {
|
||||
ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -6834,10 +6834,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 12] /* poly */
|
||||
vmovdqu xmm4, [eax]
|
||||
vmovdqu xmm5, [eax + 16]
|
||||
vmovdqu xmm6, [eax + 32]
|
||||
vmovdqu xmm7, [eax + 48]
|
||||
vmovdqu xmm4, [eax] // C0
|
||||
vmovdqu xmm5, [eax + 16] // C1
|
||||
vmovdqu xmm6, [eax + 32] // C2
|
||||
vmovdqu xmm7, [eax + 48] // C3
|
||||
vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords
|
||||
vpermq ymm5, ymm5, 0x44
|
||||
vpermq ymm6, ymm6, 0x44
|
||||
@ -6850,25 +6850,22 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
// 2 pixel loop.
|
||||
align 16
|
||||
convertloop:
|
||||
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
|
||||
lea eax, [eax + 8]
|
||||
vcvtdq2ps ymm0, ymm0 // X 8 floats
|
||||
vmulps ymm2, ymm0, ymm0 // X * X
|
||||
vmulps ymm3, ymm0, ymm7 // C3 * X
|
||||
vmulps ymm1, ymm0, ymm5 // C1 * X
|
||||
vmulps ymm3, ymm2, ymm3 // C3 * X * X * X
|
||||
vmulps ymm2, ymm2, ymm6 // C2 * X * X
|
||||
vaddps ymm1, ymm1, ymm4 // result = C0 + C1 * X
|
||||
vaddps ymm1, ymm1, ymm3 // result += C3 * X * X * X
|
||||
vaddps ymm1, ymm1, ymm2 // result += C2 * X * X
|
||||
vcvttps2dq ymm1, ymm1
|
||||
vpackusdw ymm1, ymm1, ymm1 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
||||
vpermq ymm1, ymm1, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
||||
vpackuswb xmm1, xmm1, xmm1 // bgrabgra_00000000_00000000_00000000
|
||||
sub ecx, 2
|
||||
vmovq qword ptr [edx], xmm1
|
||||
lea edx, [edx + 8]
|
||||
jg convertloop
|
||||
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
|
||||
lea eax, [eax + 8]
|
||||
vcvtdq2ps ymm0, ymm0 // X 8 floats
|
||||
vmulps ymm2, ymm0, ymm0 // X * X
|
||||
vmulps ymm3, ymm0, ymm7 // C3 * X
|
||||
vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
|
||||
vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
|
||||
vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
|
||||
vcvttps2dq ymm0, ymm0
|
||||
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
||||
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
||||
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
|
||||
sub ecx, 2
|
||||
vmovq qword ptr [edx], xmm0
|
||||
lea edx, [edx + 8]
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
|
||||
@ -41,6 +41,8 @@ TEST_F(libyuvTest, TestCpuHas) {
|
||||
printf("Has AVX2 %x\n", has_avx2);
|
||||
int has_erms = TestCpuFlag(kCpuHasERMS);
|
||||
printf("Has ERMS %x\n", has_erms);
|
||||
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
|
||||
printf("Has FMA3 %x\n", has_fma3);
|
||||
int has_mips = TestCpuFlag(kCpuHasMIPS);
|
||||
printf("Has MIPS %x\n", has_mips);
|
||||
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
|
||||
@ -93,10 +95,8 @@ TEST_F(libyuvTest, TestCpuId) {
|
||||
TEST_F(libyuvTest, TestLinuxNeon) {
|
||||
int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
|
||||
if (testdata) {
|
||||
EXPECT_EQ(0,
|
||||
ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
|
||||
EXPECT_EQ(kCpuHasNEON,
|
||||
ArmCpuCaps("unit_test/testdata/tegra3.txt"));
|
||||
EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
|
||||
EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt"));
|
||||
} else {
|
||||
printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
|
||||
}
|
||||
|
||||
@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) {
|
||||
int has_avx = TestCpuFlag(kCpuHasAVX);
|
||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||
int has_erms = TestCpuFlag(kCpuHasERMS);
|
||||
int has_fma3 = TestCpuFlag(kCpuHasFMA3);
|
||||
printf("Has SSE2 %x\n", has_sse2);
|
||||
printf("Has SSSE3 %x\n", has_ssse3);
|
||||
printf("Has SSE4.1 %x\n", has_sse41);
|
||||
@ -86,6 +87,7 @@ int main(int argc, const char* argv[]) {
|
||||
printf("Has AVX %x\n", has_avx);
|
||||
printf("Has AVX2 %x\n", has_avx2);
|
||||
printf("Has ERMS %x\n", has_erms);
|
||||
printf("Has FMA3 %x\n", has_fma3);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user