mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-15 22:59:53 +08:00
Improved polynomial for AVX2 by using vpmovzxbd and removing vmovdqa.
BUG=265
TESTED=libyuvTest.TestARGBPolynomial
R=jingning@google.com, ryanpetrie@google.com
Review URL: https://webrtc-codereview.appspot.com/2184005
git-svn-id: http://libyuv.googlecode.com/svn/trunk@781 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
6da76f3b34
commit
5442018d64
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 780
|
Version: 781
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 780
|
#define LIBYUV_VERSION 781
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -6787,6 +6787,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
|
// (slow) vpmovzxbd xmm0, dword ptr [eax] // BGRA pixel
|
||||||
movd xmm0, [eax] // BGRA
|
movd xmm0, [eax] // BGRA
|
||||||
lea eax, [eax + 4]
|
lea eax, [eax + 4]
|
||||||
punpcklbw xmm0, xmm3
|
punpcklbw xmm0, xmm3
|
||||||
@ -6833,39 +6834,26 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
|||||||
mov eax, [esp + 4] /* src_argb */
|
mov eax, [esp + 4] /* src_argb */
|
||||||
mov edx, [esp + 8] /* dst_argb */
|
mov edx, [esp + 8] /* dst_argb */
|
||||||
mov ecx, [esp + 16] /* width */
|
mov ecx, [esp + 16] /* width */
|
||||||
vpxor ymm3, ymm3, ymm3 // 0 constant for zero extending bytes to ints.
|
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
vmovq xmm0, qword ptr [eax] // 2 BGRA pixels
|
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
|
||||||
lea eax, [eax + 8]
|
lea eax, [eax + 8]
|
||||||
|
vcvtdq2ps ymm0, ymm0 // X 8 floats
|
||||||
// vpmovzxbd ymm0, ymm0
|
vmulps ymm2, ymm0, ymm0 // X * X
|
||||||
// TODO(fbarchard): Consider vex256 to avoid vpermq.
|
vmulps ymm3, ymm0, ymm7 // C3 * X
|
||||||
vpunpcklbw xmm0, xmm0, xmm3 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
vmulps ymm1, ymm0, ymm5 // C1 * X
|
||||||
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
vmulps ymm3, ymm2, ymm3 // C3 * X * X * X
|
||||||
vpunpcklwd ymm0, ymm0, ymm3 // b000g000_r000a000_b000g000_r000a000
|
|
||||||
|
|
||||||
vcvtdq2ps ymm0, ymm0 // 8 floats
|
|
||||||
vmovdqa ymm1, ymm0 // X
|
|
||||||
vmulps ymm0, ymm0, ymm5 // C1 * X
|
|
||||||
vaddps ymm0, ymm0, ymm4 // result = C0 + C1 * X
|
|
||||||
vmovdqa ymm2, ymm1
|
|
||||||
vmulps ymm2, ymm2, ymm1 // X * X
|
|
||||||
vmulps ymm1, ymm1, ymm2 // X * X * X
|
|
||||||
vmulps ymm2, ymm2, ymm6 // C2 * X * X
|
vmulps ymm2, ymm2, ymm6 // C2 * X * X
|
||||||
vmulps ymm1, ymm1, ymm7 // C3 * X * X * X
|
vaddps ymm1, ymm1, ymm4 // result = C0 + C1 * X
|
||||||
vaddps ymm0, ymm0, ymm2 // result += C2 * X * X
|
vaddps ymm1, ymm1, ymm3 // result += C3 * X * X * X
|
||||||
vaddps ymm0, ymm0, ymm1 // result += C3 * X * X * X
|
vaddps ymm1, ymm1, ymm2 // result += C2 * X * X
|
||||||
vcvttps2dq ymm0, ymm0
|
vcvttps2dq ymm1, ymm1
|
||||||
|
vpackusdw ymm1, ymm1, ymm1 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
||||||
// vpmovzxdb ymm0, ymm0 // b000g000_r000a000_b000g000_r000a000
|
vpermq ymm1, ymm1, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
||||||
vpackusdw ymm0, ymm0, ymm3 // b0g0r0a0_00000000_b0g0r0a0_00000000
|
vpackuswb xmm1, xmm1, xmm1 // bgrabgra_00000000_00000000_00000000
|
||||||
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
|
||||||
vpackuswb xmm0, xmm0, xmm3 // b0g0r0a0_b0g0r0a0_00000000_00000000
|
|
||||||
|
|
||||||
sub ecx, 2
|
sub ecx, 2
|
||||||
vmovq qword ptr [edx], xmm0
|
vmovq qword ptr [edx], xmm1
|
||||||
lea edx, [edx + 8]
|
lea edx, [edx + 8]
|
||||||
jg convertloop
|
jg convertloop
|
||||||
vzeroupper
|
vzeroupper
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user