diff --git a/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h
index 8a81e8213..7109fbab1 100644
--- a/include/libyuv/macros_msa.h
+++ b/include/libyuv/macros_msa.h
@@ -15,6 +15,98 @@
 #include <msa.h>
 #include <stdint.h>
 
+#if (__mips_isa_rev >= 6)
+  #define LW(psrc) ( {                                    \
+    uint8 *psrc_lw_m = (uint8 *) (psrc);                  \
+    uint32 val_m;                                         \
+                                                          \
+    asm volatile (                                        \
+      "lw  %[val_m],  %[psrc_lw_m]  \n\t"                 \
+                                                          \
+      : [val_m] "=r" (val_m)                              \
+      : [psrc_lw_m] "m" (*psrc_lw_m)                      \
+    );                                                    \
+                                                          \
+    val_m;                                                \
+  } )
+
+  #if (__mips == 64)
+    #define LD(psrc) ( {                                  \
+      uint8 *psrc_ld_m = (uint8 *) (psrc);                \
+      uint64 val_m = 0;                                   \
+                                                          \
+      asm volatile (                                      \
+        "ld  %[val_m],  %[psrc_ld_m]  \n\t"               \
+                                                          \
+        : [val_m] "=r" (val_m)                            \
+        : [psrc_ld_m] "m" (*psrc_ld_m)                    \
+      );                                                  \
+                                                          \
+      val_m;                                              \
+    } )
+  #else  // !(__mips == 64)
+    #define LD(psrc) ( {                                       \
+      uint8 *psrc_ld_m = (uint8 *) (psrc);                     \
+      uint32 val0_m, val1_m;                                   \
+      uint64 val_m = 0;                                        \
+                                                               \
+      val0_m = LW(psrc_ld_m);                                  \
+      val1_m = LW(psrc_ld_m + 4);                              \
+                                                               \
+      val_m = (uint64) (val1_m);                               \
+      val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000);   \
+      val_m = (uint64) (val_m | (uint64) val0_m);              \
+                                                               \
+      val_m;                                                   \
+    } )
+  #endif  // (__mips == 64)
+#else  // !(__mips_isa_rev >= 6)
+  #define LW(psrc) ( {                                    \
+    uint8 *psrc_lw_m = (uint8 *) (psrc);                  \
+    uint32 val_m;                                         \
+                                                          \
+    asm volatile (                                        \
+      "ulw  %[val_m],  %[psrc_lw_m]  \n\t"                \
+                                                          \
+      : [val_m] "=r" (val_m)                              \
+      : [psrc_lw_m] "m" (*psrc_lw_m)                      \
+    );                                                    \
+                                                          \
+    val_m;                                                \
+  } )
+
+  #if (__mips == 64)
+    #define LD(psrc) ( {                                  \
+      uint8 *psrc_ld_m = (uint8 *) (psrc);                \
+      uint64 val_m = 0;                                   \
+                                                          \
+      asm volatile (                                      \
+        "uld  %[val_m],  %[psrc_ld_m]  \n\t"              \
+                                                          \
+        : [val_m] "=r" (val_m)                            \
+        : [psrc_ld_m] "m" (*psrc_ld_m)                    \
+      );                                                  \
+                                                          \
+      val_m;                                              \
+    } )
+  #else  // !(__mips == 64)
+    #define LD(psrc) ( {                                       \
+      uint8 *psrc_ld_m = (uint8 *) (psrc);                     \
+      uint32 val0_m, val1_m;                                   \
+      uint64 val_m = 0;                                        \
+                                                               \
+      val0_m = LW(psrc_ld_m);                                  \
+      val1_m = LW(psrc_ld_m + 4);                              \
+                                                               \
+      val_m = (uint64) (val1_m);                               \
+      val_m = (uint64) ((val_m << 32) & 0xFFFFFFFF00000000);   \
+      val_m = (uint64) (val_m | (uint64) val0_m);              \
+                                                               \
+      val_m;                                                   \
+    } )
+  #endif  // (__mips == 64)
+#endif  // (__mips_isa_rev >= 6)
+
 #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))  /* NOLINT */
 #define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 601e05acc..04ef2250b 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -382,7 +382,8 @@ extern "C" {
 #define HAS_ARGB4444TOARGBROW_MSA
 #define HAS_ARGBTOYROW_MSA
 #define HAS_ARGBTOUVROW_MSA
-
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGBAROW_MSA
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -651,6 +652,18 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I422ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGBARow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgba,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
 
 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
 void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
@@ -1629,6 +1642,18 @@ void I422ToARGBRow_DSPR2(const uint8* src_y,
                          uint8* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
+void I422ToARGBRow_Any_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGBARow_Any_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 
 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index da48b1859..6e23cf743 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -102,6 +102,14 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
       I422ToARGBRow = I422ToARGBRow_DSPR2;
     }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@@ -272,6 +280,14 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
       I422ToARGBRow = I422ToARGBRow_DSPR2;
     }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 89d24f474..7847622cd 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -459,6 +459,14 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
       I422ToRGBARow = I422ToRGBARow_DSPR2;
     }
 #endif
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@@ -848,6 +856,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
       I422ToARGBRow = I422ToARGBRow_DSPR2;
     }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 7a10a69f7..cb17130a1 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1243,6 +1243,14 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
       I422ToRGBARow = I422ToRGBARow_DSPR2;
     }
 #endif
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
diff --git a/source/row_any.cc b/source/row_any.cc
index 07e606c6e..6b8962578 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -165,6 +165,10 @@ ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
 ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
 ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
 #endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+#endif
 #undef ANY31C
 
 // Any 2 planes to 1.
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 3a55b0b75..d0990b4ff 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -19,6 +19,66 @@ namespace libyuv {
 extern "C" {
 #endif
 
+// Convert 8 Y, 4 U and 4 V samples (in0/in1/in2) to 8 interleave-ready
+// B/G/R half-word vectors (out0/out1/out2), using the fixed-point YUV
+// coefficients ub/vr/ug/vg, biases bb/bg/br and Y gain yg.
+#define I422TORGB(in0, in1, in2, ub, vr, ug, vg,               \
+                  bb, bg, br, yg, out0, out1, out2) {          \
+  v8i16 vec0_m;                                                \
+  v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                \
+  v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m;                \
+  v4i32 max_val_m = __msa_ldi_w(255);                          \
+  v8i16 zero_m = { 0 };                                        \
+                                                               \
+  in1 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in1);        \
+  in2 = (v16u8) __msa_ilvr_b((v16i8) in2, (v16i8) in2);        \
+  vec0_m = (v8i16) __msa_ilvr_b((v16i8) in0, (v16i8) in0);     \
+  reg0_m = (v4i32) __msa_ilvr_h(zero_m, vec0_m);               \
+  reg1_m = (v4i32) __msa_ilvl_h(zero_m, vec0_m);               \
+  reg0_m *= yg;                                                \
+  reg1_m *= yg;                                                \
+  reg0_m = __msa_srai_w(reg0_m, 16);                           \
+  reg1_m = __msa_srai_w(reg1_m, 16);                           \
+  reg4_m = reg0_m + br;                                        \
+  reg5_m = reg1_m + br;                                        \
+  reg2_m = reg0_m + bg;                                        \
+  reg3_m = reg1_m + bg;                                        \
+  reg0_m += bb;                                                \
+  reg1_m += bb;                                                \
+  vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in1);  \
+  reg6_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m);       \
+  reg7_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m);       \
+  vec0_m = (v8i16) __msa_ilvr_b((v16i8) zero_m, (v16i8) in2);  \
+  reg8_m = (v4i32) __msa_ilvr_h(zero_m, (v8i16) vec0_m);       \
+  reg9_m = (v4i32) __msa_ilvl_h(zero_m, (v8i16) vec0_m);       \
+  reg0_m -= reg6_m * ub;                                       \
+  reg1_m -= reg7_m * ub;                                       \
+  reg2_m -= reg6_m * ug;                                       \
+  reg3_m -= reg7_m * ug;                                       \
+  reg4_m -= reg8_m * vr;                                       \
+  reg5_m -= reg9_m * vr;                                       \
+  reg2_m -= reg8_m * vg;                                       \
+  reg3_m -= reg9_m * vg;                                       \
+  reg0_m = __msa_srai_w(reg0_m, 6);                            \
+  reg1_m = __msa_srai_w(reg1_m, 6);                            \
+  reg2_m = __msa_srai_w(reg2_m, 6);                            \
+  reg3_m = __msa_srai_w(reg3_m, 6);                            \
+  reg4_m = __msa_srai_w(reg4_m, 6);                            \
+  reg5_m = __msa_srai_w(reg5_m, 6);                            \
+  reg0_m = __msa_maxi_s_w(reg0_m, 0);                          \
+  reg1_m = __msa_maxi_s_w(reg1_m, 0);                          \
+  reg2_m = __msa_maxi_s_w(reg2_m, 0);                          \
+  reg3_m = __msa_maxi_s_w(reg3_m, 0);                          \
+  reg4_m = __msa_maxi_s_w(reg4_m, 0);                          \
+  reg5_m = __msa_maxi_s_w(reg5_m, 0);                          \
+  reg0_m = __msa_min_s_w(reg0_m, max_val_m);                   \
+  reg1_m = __msa_min_s_w(reg1_m, max_val_m);                   \
+  reg2_m = __msa_min_s_w(reg2_m, max_val_m);                   \
+  reg3_m = __msa_min_s_w(reg3_m, max_val_m);                   \
+  reg4_m = __msa_min_s_w(reg4_m, max_val_m);                   \
+  reg5_m = __msa_min_s_w(reg5_m, max_val_m);                   \
+  out0 = __msa_pckev_h((v8i16) reg1_m, (v8i16) reg0_m);        \
+  out1 = __msa_pckev_h((v8i16) reg3_m, (v8i16) reg2_m);        \
+  out2 = __msa_pckev_h((v8i16) reg5_m, (v8i16) reg4_m);        \
+}
+
 void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
   int x;
   v16u8 src0, src1, src2, src3;
@@ -101,6 +161,90 @@ void I422ToUYVYRow_MSA(const uint8* src_y,
     dst_uyvy += 32;
   }
 }
 
+void I422ToARGBRow_MSA(const uint8* src_y, const uint8* src_u,
+                       const uint8* src_v, uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants, int width) {
+  int x;
+  int32 data_u, data_v;
+  int64 data_y;
+  v16u8 src0, src1, src2, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 const_255 = (v16u8) __msa_ldi_b(255);
+  v4i32 zero = { 0 };
+
+  vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+  vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+  vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]);
+  vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]);
+  vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+  vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+  vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+  vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+  for (x = 0; x < width; x += 8) {
+    data_y = LD(src_y);
+    data_u = LW(src_u);
+    data_v = LW(src_v);
+    src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y);
+    src1 = (v16u8) __msa_insert_w(zero, 0, data_u);
+    src2 = (v16u8) __msa_insert_w(zero, 0, data_v);
+    I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
+              vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) vec1, (v16i8) vec0);
+    vec1 = (v8i16) __msa_ilvev_b((v16i8) const_255, (v16i8) vec2);
+    dst0 = (v16u8) __msa_ilvr_h((v8i16) vec1, (v8i16) vec0);
+    dst1 = (v16u8) __msa_ilvl_h((v8i16) vec1, (v8i16) vec0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    rgb_buf += 32;
+  }
+}
+
+void I422ToRGBARow_MSA(const uint8* src_y, const uint8* src_u,
+                       const uint8* src_v, uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants, int width) {
+  int x;
+  int64 data_y;
+  int32 data_u, data_v;
+  v16u8 src0, src1, src2, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 const_255 = (v16u8) __msa_ldi_b(255);
+  v4i32 zero = { 0 };
+
+  vec_ub = __msa_fill_w(yuvconstants->kUVToB[0]);
+  vec_vr = __msa_fill_w(yuvconstants->kUVToR[1]);
+  vec_ug = __msa_fill_w(yuvconstants->kUVToG[0]);
+  vec_vg = __msa_fill_w(yuvconstants->kUVToG[1]);
+  vec_bb = __msa_fill_w(yuvconstants->kUVBiasB[0]);
+  vec_bg = __msa_fill_w(yuvconstants->kUVBiasG[0]);
+  vec_br = __msa_fill_w(yuvconstants->kUVBiasR[0]);
+  vec_yg = __msa_fill_w(yuvconstants->kYToRgb[0]);
+
+  for (x = 0; x < width; x += 8) {
+    data_y = LD(src_y);
+    data_u = LW(src_u);
+    data_v = LW(src_v);
+    src0 = (v16u8) __msa_insert_d((v2i64) zero, 0, data_y);
+    src1 = (v16u8) __msa_insert_w(zero, 0, data_u);
+    src2 = (v16u8) __msa_insert_w(zero, 0, data_v);
+    I422TORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg,
+              vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2);
+    vec0 = (v8i16) __msa_ilvev_b((v16i8) vec0, (v16i8) const_255);
+    vec1 = (v8i16) __msa_ilvev_b((v16i8) vec2, (v16i8) vec1);
+    dst0 = (v16u8) __msa_ilvr_h(vec1, vec0);
+    dst1 = (v16u8) __msa_ilvl_h(vec1, vec0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    rgb_buf += 32;
+  }
+}
+
 void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
   int x;
   v16u8 src0, src1, src2, src3, dst0, dst1;
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 17f51ae9b..b2a139677 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -474,6 +474,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
       I422ToARGBRow = I422ToARGBRow_DSPR2;
     }
 #endif
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
+  }
+#endif
 
   void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =