diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 8e673291b..9a39f163b 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -34,6 +34,7 @@ uint32 HammingDistance_SSE42(const uint8* src_a, "xor %%r13,%%r13 \n" "xor %%r12,%%r12 \n" + // Process 32 bytes per loop. LABELALIGN "1: \n" "mov (%0),%%rax \n" @@ -56,7 +57,6 @@ uint32 HammingDistance_SSE42(const uint8* src_a, "add %%rsi,%%r12 \n" "sub $0x20,%2 \n" "jg 1b \n" - "add %%r15, %%r14 \n" "add %%r13, %%r12 \n" "add %%r14, %%r12 \n" @@ -76,34 +76,36 @@ uint32 HammingDistance_SSE42(const uint8* src_a, int count) { uint32 diff = 0u; - asm volatile(LABELALIGN - "1: \n" - "mov (%0),%%eax \n" - "mov 0x4(%0),%%edx \n" - "xor (%1),%%eax \n" - "xor 0x4(%1),%%edx \n" - "popcnt %%eax,%%eax \n" - "add %%eax,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "mov 0x8(%0),%%eax \n" - "mov 0xc(%0),%%edx \n" - "xor 0x8(%1),%%eax \n" - "xor 0xc(%1),%%edx \n" - "popcnt %%eax,%%eax \n" - "add %%eax,%3 \n" - "popcnt %%edx,%%edx \n" - "add %%edx,%3 \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "+r"(diff) // %3 - : - : "memory", "cc", "eax", "edx"); + asm volatile( + // Process 16 bytes per loop. 
+ LABELALIGN + "1: \n" + "mov (%0),%%eax \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%eax \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%eax,%%eax \n" + "add %%eax,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%eax \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%eax \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%eax,%%eax \n" + "add %%eax,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "eax", "edx"); return diff; } diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index 7258809a8..ff39b2b0f 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -253,10 +253,9 @@ TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } -#else +#else h1 = HammingDistance_C(src_a, src_b, kMaxWidth); #endif - } EXPECT_EQ(h0, h1); @@ -339,7 +338,7 @@ static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848 TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { uint32 h1 = 0; - const int kMaxWidth =benchmark_width_ * benchmark_height_; + const int kMaxWidth = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 255u, kMaxWidth); @@ -375,7 +374,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } -#else +#else h1 = HammingDistance_C(src_a, src_b, kMaxWidth); #endif } @@ -383,7 +382,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { // A large count will cause the low level to potentially overflow so the // result can not be expected to be correct. // TODO(fbarchard): Consider expecting the low 16 bits to match. 
- if (kMaxWidth<= kMaxOptCount) { + if (kMaxWidth <= kMaxOptCount) { EXPECT_EQ(kMaxWidth * 8U, h1); } else { if (kMaxWidth * 8ULL != static_cast<uint64>(h1)) {