mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
SSE2 bilinear fix for posix.
BUG=177 TEST=none Review URL: https://webrtc-codereview.appspot.com/1061004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@548 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
0f00506af7
commit
70b4928158
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 547
|
||||
Version: 548
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 547
|
||||
#define LIBYUV_VERSION 548
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -926,13 +926,14 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef SSE2_DISABLED
|
||||
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
|
||||
// Normal formula for bilinear interpolation is:
|
||||
// source_y_fraction * row1 + (1 - source_y_fraction) * row0
|
||||
// SSE2 version uses a single multiply of the difference:
|
||||
// source_y_fraction * (row1 - row0) + row0
|
||||
#define HAS_SCALEFILTERROWS_SSE2_DISABLED
|
||||
// TODO(fbarchard): Specialize same as SSSE3.
|
||||
|
||||
#define HAS_SCALEFILTERROWS_SSE2
|
||||
__declspec(naked) __declspec(align(16))
|
||||
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
@ -948,13 +949,15 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
sub edi, esi
|
||||
cmp eax, 0
|
||||
je xloop1
|
||||
cmp eax, 128
|
||||
cmp eax, 128 // 50%?
|
||||
je xloop2
|
||||
|
||||
movd xmm5, eax // xmm5 = y fraction
|
||||
punpcklbw xmm5, xmm5
|
||||
psrlw xmm5, 1
|
||||
punpcklwd xmm5, xmm5
|
||||
pshufd xmm5, xmm5, 0
|
||||
punpckldq xmm5, xmm5
|
||||
punpcklqdq xmm5, xmm5
|
||||
pxor xmm4, xmm4
|
||||
|
||||
align 16
|
||||
@ -969,6 +972,8 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
punpckhbw xmm1, xmm4
|
||||
psubw xmm2, xmm0 // row1 - row0
|
||||
psubw xmm3, xmm1
|
||||
paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
|
||||
paddw xmm3, xmm3
|
||||
pmulhw xmm2, xmm5 // scale diff
|
||||
pmulhw xmm3, xmm5
|
||||
paddw xmm0, xmm2 // sum rows
|
||||
@ -1021,7 +1026,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // SSE2_DISABLED
|
||||
|
||||
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
|
||||
#define HAS_SCALEFILTERROWS_SSSE3
|
||||
__declspec(naked) __declspec(align(16))
|
||||
@ -1933,9 +1938,9 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
);
|
||||
}
|
||||
|
||||
#ifndef SSE2_DISABLED
|
||||
// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
|
||||
#define HAS_SCALEFILTERROWS_SSE2_DISABLED
|
||||
// For more info see comment above ScaleFilterRows_SSE2 for MSVC++
|
||||
#define HAS_SCALEFILTERROWS_SSE2
|
||||
static void ScaleFilterRows_SSE2(uint8* dst_ptr,
|
||||
const uint8* src_ptr, ptrdiff_t src_stride,
|
||||
int dst_width, int source_y_fraction) {
|
||||
@ -1945,10 +1950,13 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
|
||||
"je 2f \n"
|
||||
"cmp $0x80,%3 \n"
|
||||
"je 3f \n"
|
||||
|
||||
"movd %3,%%xmm5 \n"
|
||||
"punpcklbw %%xmm5,%%xmm5 \n"
|
||||
"psrlw $0x1,%%xmm5 \n"
|
||||
"punpcklwd %%xmm5,%%xmm5 \n"
|
||||
"pshufd $0x0,%%xmm5,%%xmm5 \n"
|
||||
"punpckldq %%xmm5,%%xmm5 \n"
|
||||
"punpcklqdq %%xmm5,%%xmm5 \n"
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
@ -1962,6 +1970,8 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
|
||||
"punpckhbw %%xmm4,%%xmm1 \n"
|
||||
"psubw %%xmm0,%%xmm2 \n"
|
||||
"psubw %%xmm1,%%xmm3 \n"
|
||||
"paddw %%xmm2,%%xmm2 \n"
|
||||
"paddw %%xmm3,%%xmm3 \n"
|
||||
"pmulhw %%xmm5,%%xmm2 \n"
|
||||
"pmulhw %%xmm5,%%xmm3 \n"
|
||||
"paddw %%xmm2,%%xmm0 \n"
|
||||
@ -1996,16 +2006,15 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
|
||||
"movdqa %%xmm0,(%1,%0,1) \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_width), // %2
|
||||
"+r"(source_y_fraction) // %3
|
||||
: "r"(static_cast<intptr_t>(src_stride)) // %4
|
||||
"+r"(dst_width) // %2
|
||||
: "r"(source_y_fraction), // %3
|
||||
"r"(static_cast<intptr_t>(src_stride)) // %4
|
||||
: "memory", "cc"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif // SSE2_DISABLED
|
||||
|
||||
// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
|
||||
#define HAS_SCALEFILTERROWS_SSSE3
|
||||
|
||||
@ -716,6 +716,7 @@ TESTINTERPOLATE(64)
|
||||
TESTINTERPOLATE(128)
|
||||
TESTINTERPOLATE(192)
|
||||
TESTINTERPOLATE(255)
|
||||
TESTINTERPOLATE(85)
|
||||
|
||||
static int TestBlend(int width, int height, int benchmark_iterations,
|
||||
int invert, int off) {
|
||||
|
||||
@ -77,7 +77,7 @@ static int ARGBTestFilter(int src_width, int src_height,
|
||||
|
||||
// Report performance of C vs OPT
|
||||
printf("filter %d - %8d us C - %8d us OPT\n",
|
||||
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
|
||||
f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
|
||||
|
||||
// C version may be a little off from the optimized. Order of
|
||||
// operations may introduce rounding somewhere. So do a difference
|
||||
|
||||
@ -115,7 +115,7 @@ static int TestFilter(int src_width, int src_height,
|
||||
|
||||
// Report performance of C vs OPT
|
||||
printf("filter %d - %8d us C - %8d us OPT\n",
|
||||
f, static_cast<int>(c_time*1e6), static_cast<int>(opt_time*1e6));
|
||||
f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
|
||||
|
||||
// C version may be a little off from the optimized. Order of
|
||||
// operations may introduce rounding somewhere. So do a difference
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user