mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-11 04:39:53 +08:00
ARGBPolynomial SSE2 row function now processes 2 pixels at a time.
BUG=265
TEST=*Poly*
R=changjun.yang@intel.com
Review URL: https://webrtc-codereview.appspot.com/2195004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@782 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
5442018d64
commit
c3c06ec328
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 781
|
Version: 782
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 781
|
#define LIBYUV_VERSION 782
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -2052,7 +2052,7 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
|
|||||||
uint8* dst_argb, const float* poly,
|
uint8* dst_argb, const float* poly,
|
||||||
int width) = ARGBPolynomialRow_C;
|
int width) = ARGBPolynomialRow_C;
|
||||||
#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
|
#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
|
||||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
|
||||||
ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
|
ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -6774,42 +6774,53 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
|
|||||||
uint8* dst_argb, const float* poly,
|
uint8* dst_argb, const float* poly,
|
||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 12] /* poly */
|
push esi
|
||||||
movdqu xmm4, [eax]
|
mov eax, [esp + 4 + 4] /* src_argb */
|
||||||
movdqu xmm5, [eax + 16]
|
mov edx, [esp + 4 + 8] /* dst_argb */
|
||||||
movdqu xmm6, [eax + 32]
|
mov esi, [esp + 4 + 12] /* poly */
|
||||||
movdqu xmm7, [eax + 48]
|
mov ecx, [esp + 4 + 16] /* width */
|
||||||
|
|
||||||
mov eax, [esp + 4] /* src_argb */
|
|
||||||
mov edx, [esp + 8] /* dst_argb */
|
|
||||||
mov ecx, [esp + 16] /* width */
|
|
||||||
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
|
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
// (slow) vpmovzxbd xmm0, dword ptr [eax] // BGRA pixel
|
// (slow) pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
|
||||||
movd xmm0, [eax] // BGRA
|
movq xmm0, qword ptr [eax] // BGRABGRA
|
||||||
lea eax, [eax + 4]
|
lea eax, [eax + 8]
|
||||||
punpcklbw xmm0, xmm3
|
punpcklbw xmm0, xmm3
|
||||||
punpcklwd xmm0, xmm3
|
movdqa xmm4, xmm0
|
||||||
|
punpcklwd xmm0, xmm3 // pixel 0
|
||||||
|
punpckhwd xmm4, xmm3 // pixel 1
|
||||||
cvtdq2ps xmm0, xmm0 // 4 floats
|
cvtdq2ps xmm0, xmm0 // 4 floats
|
||||||
|
cvtdq2ps xmm4, xmm4
|
||||||
movdqa xmm1, xmm0 // X
|
movdqa xmm1, xmm0 // X
|
||||||
mulps xmm0, xmm5 // C1 * X
|
movdqa xmm5, xmm4
|
||||||
addps xmm0, xmm4 // result = C0 + C1 * X
|
mulps xmm0, [esi + 16] // C1 * X
|
||||||
|
mulps xmm4, [esi + 16]
|
||||||
|
addps xmm0, [esi] // result = C0 + C1 * X
|
||||||
|
addps xmm4, [esi]
|
||||||
movdqa xmm2, xmm1
|
movdqa xmm2, xmm1
|
||||||
|
movdqa xmm6, xmm5
|
||||||
mulps xmm2, xmm1 // X * X
|
mulps xmm2, xmm1 // X * X
|
||||||
|
mulps xmm6, xmm5
|
||||||
mulps xmm1, xmm2 // X * X * X
|
mulps xmm1, xmm2 // X * X * X
|
||||||
mulps xmm2, xmm6 // C2 * X * X
|
mulps xmm5, xmm6
|
||||||
mulps xmm1, xmm7 // C3 * X * X * X
|
mulps xmm2, [esi + 32] // C2 * X * X
|
||||||
|
mulps xmm6, [esi + 32]
|
||||||
|
mulps xmm1, [esi + 48] // C3 * X * X * X
|
||||||
|
mulps xmm5, [esi + 48]
|
||||||
addps xmm0, xmm2 // result += C2 * X * X
|
addps xmm0, xmm2 // result += C2 * X * X
|
||||||
|
addps xmm4, xmm6
|
||||||
addps xmm0, xmm1 // result += C3 * X * X * X
|
addps xmm0, xmm1 // result += C3 * X * X * X
|
||||||
|
addps xmm4, xmm5
|
||||||
cvttps2dq xmm0, xmm0
|
cvttps2dq xmm0, xmm0
|
||||||
|
cvttps2dq xmm4, xmm4
|
||||||
|
packuswb xmm0, xmm4
|
||||||
packuswb xmm0, xmm0
|
packuswb xmm0, xmm0
|
||||||
packuswb xmm0, xmm0
|
sub ecx, 2
|
||||||
sub ecx, 1
|
movq qword ptr [edx], xmm0
|
||||||
movd [edx], xmm0
|
lea edx, [edx + 8]
|
||||||
lea edx, [edx + 4]
|
|
||||||
jg convertloop
|
jg convertloop
|
||||||
|
pop esi
|
||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1660,7 +1660,7 @@ TEST_F(libyuvTest, TestARGBPolynomial) {
|
|||||||
SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
|
SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
|
||||||
SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
|
SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
|
||||||
|
|
||||||
static const float kWarmifyPolynomial[16] = {
|
SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
|
||||||
0.94230f, -3.03300f, -2.92500f, 0.f, // C0
|
0.94230f, -3.03300f, -2.92500f, 0.f, // C0
|
||||||
0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
|
0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
|
||||||
0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
|
0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user