diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7923583f3..16394da02 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -390,6 +390,9 @@ extern "C" {
 #define HAS_ARGBTOARGB1555ROW_MSA
 #define HAS_ARGBTOARGB4444ROW_MSA
 #define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -1809,6 +1812,14 @@ void ARGBMultiplyRow_Any_NEON(const uint8* src_argb,
                               const uint8* src_argb1,
                               uint8* dst_argb,
                               int width);
+void ARGBMultiplyRow_MSA(const uint8* src_argb,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width);
+void ARGBMultiplyRow_Any_MSA(const uint8* src_argb,
+                             const uint8* src_argb1,
+                             uint8* dst_argb,
+                             int width);
 
 // ARGB add images.
 void ARGBAddRow_C(const uint8* src_argb,
@@ -1839,6 +1850,14 @@ void ARGBAddRow_Any_NEON(const uint8* src_argb,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width);
+void ARGBAddRow_MSA(const uint8* src_argb,
+                    const uint8* src_argb1,
+                    uint8* dst_argb,
+                    int width);
+void ARGBAddRow_Any_MSA(const uint8* src_argb,
+                        const uint8* src_argb1,
+                        uint8* dst_argb,
+                        int width);
 
 // ARGB subtract images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
@@ -1870,6 +1889,14 @@ void ARGBSubtractRow_Any_NEON(const uint8* src_argb,
                               const uint8* src_argb1,
                               uint8* dst_argb,
                               int width);
+void ARGBSubtractRow_MSA(const uint8* src_argb,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width);
+void ARGBSubtractRow_Any_MSA(const uint8* src_argb,
+                             const uint8* src_argb1,
+                             uint8* dst_argb,
+                             int width);
 
 void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 06ac9e374..86854ae25 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1115,6 +1115,14 @@ int ARGBMultiply(const uint8* src_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+    }
+  }
+#endif
 
   // Multiply plane
   for (y = 0; y < height; ++y) {
@@ -1184,6 +1192,14 @@ int ARGBAdd(const uint8* src_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAddRow = ARGBAddRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_MSA;
+    }
+  }
+#endif
 
   // Add plane
   for (y = 0; y < height; ++y) {
@@ -1248,6 +1264,14 @@ int ARGBSubtract(const uint8* src_argb0,
     }
   }
 #endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_MSA;
+    }
+  }
+#endif
 
   // Subtract plane
   for (y = 0; y < height; ++y) {
diff --git a/source/row_any.cc b/source/row_any.cc
index 7871d9c10..9599b63e9 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -234,6 +234,15 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
 #ifdef HAS_ARGBSUBTRACTROW_NEON
 ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
 #ifdef HAS_SOBELROW_SSE2
 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
 #endif
diff --git a/source/row_msa.cc b/source/row_msa.cc
index f47871fe7..de347f127 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -957,6 +957,87 @@ void ARGBToUV444Row_MSA(const uint8* src_argb,
   }
 }
 
+void ARGBMultiplyRow_MSA(const uint8* src_argb0,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb0 += 16;
+    src_argb1 += 16;
+    dst_argb += 16;
+  }
+}
+
+void ARGBAddRow_MSA(const uint8* src_argb0,
+                    const uint8* src_argb1,
+                    uint8* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBSubtractRow_MSA(const uint8* src_argb0,
+                         const uint8* src_argb1,
+                         uint8* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
+    dst0 = __msa_subs_u_b(src0, src2);
+    dst1 = __msa_subs_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
                            uint8* dst_argb,
                            int width) {
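Review note (not part of the patch): a minimal scalar sketch of the per-channel math the MSA rows above vectorize, assuming they are meant to match libyuv's C reference rows (ARGBMultiplyRow_C, ARGBAddRow_C, ARGBSubtractRow_C in row_common.cc). The helper names below are illustrative only, not libyuv API.

#include <stdint.h>

// ARGBMultiplyRow: widen one channel with REPEAT8 (v | v << 8 == v * 257),
// multiply by the other channel, then shift right by 16 -- roughly
// (a * b) / 255. The MSA row does the same via ilvr/ilvl widening, 32-bit
// multiplies, srai.w by 16, and pckev packing, 4 pixels per iteration.
static inline uint8_t MultiplyChannel(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint32_t)a * 257u * b) >> 16);
}

// ARGBAddRow / ARGBSubtractRow: per-channel saturating byte add/subtract,
// matching __msa_adds_u_b / __msa_subs_u_b, 8 pixels (32 bytes) per iteration.
static inline uint8_t AddChannel(uint8_t a, uint8_t b) {
  uint32_t sum = (uint32_t)a + b;
  return (uint8_t)(sum > 255u ? 255u : sum);  // clamp high, like adds_u.b
}

static inline uint8_t SubtractChannel(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? (uint8_t)(a - b) : 0u);  // clamp low, like subs_u.b
}

The different step sizes are why ARGBMultiply dispatches with IS_ALIGNED(width, 4) and an ANY21 mask of 3, while ARGBAdd/ARGBSubtract use IS_ALIGNED(width, 8) and a mask of 7.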