diff --git a/source/compare.cc b/source/compare.cc
index dc715e019..255e77276 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -114,8 +114,8 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    // The SSE2 row function loads with movdqu, so no alignment check here.
     // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
diff --git a/source/compare_posix.cc b/source/compare_posix.cc
index ac361190e..64dfc3578 100644
--- a/source/compare_posix.cc
+++ b/source/compare_posix.cc
@@ -25,9 +25,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     "pxor %%xmm5,%%xmm5 \n"
     LABELALIGN
   "1: \n"
-    "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
     "lea " MEMLEA(0x10, 0) ",%0 \n"
-    "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
     "lea " MEMLEA(0x10, 1) ",%1 \n"
     "sub $0x10,%2 \n"
     "movdqa %%xmm1,%%xmm3 \n"
diff --git a/source/compare_win.cc b/source/compare_win.cc
index 99831651f..50d4d3464 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -29,9 +29,9 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
     align 4
 
   wloop:
-    movdqa xmm1, [eax]
+    movdqu xmm1, [eax]
     lea eax, [eax + 16]
-    movdqa xmm2, [edx]
+    movdqu xmm2, [edx]
     lea edx, [edx + 16]
     sub ecx, 16
     movdqa xmm3, xmm1 // abs trick
diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc
index 141445ec0..464e2559e 100644
--- a/unit_test/compare_test.cc
+++ b/unit_test/compare_test.cc
@@ -244,6 +244,36 @@ TEST_F(libyuvTest, BenchmarkPsnr_Opt) {
   free_aligned_buffer_64(src_b);
 }
 
+
+// Benchmarks CalcFramePsnr when the first source pointer is offset by one
+// byte (src_a + 1) from the start of its align_buffer_64 allocation.
+TEST_F(libyuvTest, BenchmarkPsnr_Unaligned) {
+  // One extra byte is allocated so the frame can start at src_a + 1.
+  align_buffer_64(src_a, benchmark_width_ * benchmark_height_ + 1);
+  align_buffer_64(src_b, benchmark_width_ * benchmark_height_);
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    src_a[i + 1] = i;
+    src_b[i] = i;
+  }
+
+  MaskCpuFlags(-1);
+
+  double opt_time = get_time();
+  for (int i = 0; i < benchmark_iterations_; ++i)
+    CalcFramePsnr(src_a + 1, benchmark_width_,
+                  src_b, benchmark_width_,
+                  benchmark_width_, benchmark_height_);
+
+  opt_time = (get_time() - opt_time) / benchmark_iterations_;
+  printf("BenchmarkPsnr_Unaligned - %8.2f us opt\n", opt_time * 1e6);
+
+  // Timing-only test: nothing is verified beyond running to completion.
+  EXPECT_EQ(0, 0);
+
+  free_aligned_buffer_64(src_a);
+  free_aligned_buffer_64(src_b);
+}
+
 TEST_F(libyuvTest, Psnr) {
   const int kSrcWidth = benchmark_width_;
   const int kSrcHeight = benchmark_height_;