diff --git a/Android.mk b/Android.mk index e03459264..854020610 100644 --- a/Android.mk +++ b/Android.mk @@ -9,6 +9,7 @@ LOCAL_SRC_FILES := \ source/compare.cc \ source/compare_common.cc \ source/compare_gcc.cc \ + source/compare_msa.cc \ source/compare_neon.cc \ source/compare_neon64.cc \ source/convert.cc \ diff --git a/BUILD.gn b/BUILD.gn index 3eefc3616..34a9975bf 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -199,6 +199,7 @@ if (libyuv_use_msa) { static_library("libyuv_msa") { sources = [ # MSA Source Files + "source/compare_msa.cc", "source/rotate_msa.cc", "source/row_msa.cc", "source/scale_msa.cc", diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h index 971aecf3c..0711898ea 100644 --- a/include/libyuv/compare_row.h +++ b/include/libyuv/compare_row.h @@ -42,6 +42,11 @@ extern "C" { #endif // clang >= 3.4 #endif // __clang__ +// clang 6 mips issue https://bugs.chromium.org/p/libyuv/issues/detail?id=715 +#if defined(__clang__) +#define DISABLE_CLANG_MSA 1 +#endif + #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_HASHDJB2_AVX2 @@ -69,14 +74,24 @@ extern "C" { #define HAS_HAMMINGDISTANCE_NEON #endif +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_HAMMINGDISTANCE_MSA + +#ifndef DISABLE_CLANG_MSA +#define HAS_SUMSQUAREERROR_MSA +#endif +#endif + uint32 HammingDistance_C(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_X86(const uint8* src_a, const uint8* src_b, int count); uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count); +uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); +uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count); uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); diff --git a/libyuv.gypi b/libyuv.gypi index 18b2feca5..ec81bc9bb 100644 --- a/libyuv.gypi +++ b/libyuv.gypi @@ -35,6 +35,7 @@ 'source/compare.cc', 'source/compare_common.cc', 'source/compare_gcc.cc', + 'source/compare_msa.cc', 'source/compare_neon.cc', 'source/compare_neon64.cc', 'source/compare_win.cc', diff --git a/source/compare.cc b/source/compare.cc index 3f7f14751..20afa0cef 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -135,6 +135,11 @@ uint64 ComputeHammingDistance(const uint8* src_a, HammingDistance = HammingDistance_AVX2; } #endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : diff) #endif @@ -186,6 +191,11 @@ uint64 ComputeSumSquareError(const uint8* src_a, SumSquareError = SumSquareError_AVX2; } #endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif #ifdef _OPENMP #pragma omp parallel for reduction(+ : sse) #endif diff --git a/source/compare_msa.cc b/source/compare_msa.cc new file mode 100644 index 000000000..da484d5cd --- /dev/null +++ b/source/compare_msa.cc @@ -0,0 +1,95 @@ +/* + * Copyright 2017 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32 HammingDistance_MSA(const uint8* src_a, const uint8* src_b, int count) { + uint32 diff = 0u; + int i; + v16u8 src0, src1, src2, src3; + v2i64 vec0 = {0}, vec1 = {0}; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + src0 ^= src2; + src1 ^= src3; + vec0 += __msa_pcnt_d((v2i64)src0); + vec1 += __msa_pcnt_d((v2i64)src1); + src_a += 32; + src_b += 32; + } + + vec0 += vec1; + diff = (uint32)__msa_copy_u_w((v4i32)vec0, 0); + diff += (uint32)__msa_copy_u_w((v4i32)vec0, 2); + return diff; +} + +#ifndef DISABLE_CLANG_MSA +uint32 SumSquareError_MSA(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse = 0u; + int i; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; + v2i64 tmp0; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); + reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); + reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); + reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); + src_a += 32; + src_b += 32; + } + + reg0 += reg1; + reg2 += reg3; + reg0 += reg2; + tmp0 = __msa_hadd_s_d(reg0, reg0); + sse = (uint32)__msa_copy_u_w((v4i32)tmp0, 0); + sse += (uint32)__msa_copy_u_w((v4i32)tmp0, 2); + return sse; +} +#endif + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)