Remove alignment constraint for SSE2. Allows the optimized function to be used with unaligned memory, improving performance in that use case. Hurts performance on core2 and prior where memory was faster with movdqa instruction.

BUG=365
TESTED=psnr, ssim and djb2 unittests pass.
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/22859004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1100 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2014-09-30 18:53:34 +00:00
parent bb5cc129e5
commit 9c4c82181b
4 changed files with 31 additions and 6 deletions

View File

@ -114,8 +114,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}

View File

@ -25,9 +25,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqa " MEMACCESS(0) ",%%xmm1 \n"
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
"movdqa " MEMACCESS(1) ",%%xmm2 \n"
"movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
"sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"

View File

@ -29,9 +29,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
align 4
wloop:
movdqa xmm1, [eax]
movdqu xmm1, [eax]
lea eax, [eax + 16]
movdqa xmm2, [edx]
movdqu xmm2, [edx]
lea edx, [edx + 16]
sub ecx, 16
movdqa xmm3, xmm1 // abs trick

View File

@ -244,6 +244,32 @@ TEST_F(libyuvTest, BenchmarkPsnr_Opt) {
free_aligned_buffer_64(src_b);
}
TEST_F(libyuvTest, BenchmarkPsnr_Unaligned) {
align_buffer_64(src_a, benchmark_width_ * benchmark_height_ + 1);
align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
src_a[i + 1] = i;
src_b[i] = i;
}
MaskCpuFlags(-1);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i)
CalcFramePsnr(src_a + 1, benchmark_width_,
src_b, benchmark_width_,
benchmark_width_, benchmark_height_);
opt_time = (get_time() - opt_time) / benchmark_iterations_;
printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
EXPECT_EQ(0, 0);
free_aligned_buffer_64(src_a);
free_aligned_buffer_64(src_b);
}
TEST_F(libyuvTest, Psnr) {
const int kSrcWidth = benchmark_width_;
const int kSrcHeight = benchmark_height_;