diff --git a/source/row_msa.cc b/source/row_msa.cc index c0b13b0fd..16d3f8390 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -24,16 +24,14 @@ extern "C" { #define ALPHA_VAL (-1) // Fill YUV -> RGB conversion constants into vectors -#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ { \ ub = __msa_fill_w(yuvconst->kUVToB[0]); \ vr = __msa_fill_w(yuvconst->kUVToR[1]); \ ug = __msa_fill_w(yuvconst->kUVToG[0]); \ vg = __msa_fill_w(yuvconst->kUVToG[1]); \ - bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ - bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ - br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \ } // Load YUV 422 pixel data @@ -70,7 +68,7 @@ extern "C" { } // Convert 8 pixels of YUV 420 to RGB. -#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ +#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \ { \ v8i16 vec0_m, vec1_m; \ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ @@ -89,6 +87,8 @@ extern "C" { reg3_m *= ubvr; \ reg0_m = __msa_srai_w(reg0_m, 16); \ reg1_m = __msa_srai_w(reg1_m, 16); \ + reg0_m += yb; \ + reg1_m += yb; \ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ @@ -102,12 +102,6 @@ extern "C" { reg3_m = reg1_m - reg3_m; \ reg7_m = reg0_m - reg7_m; \ reg4_m = reg1_m - reg4_m; \ - reg5_m += bb; \ - reg6_m += bb; \ - reg7_m += bg; \ - reg4_m += bg; \ - reg2_m += br; \ - reg3_m += br; \ reg5_m = __msa_srai_w(reg5_m, 6); \ reg6_m = __msa_srai_w(reg6_m, 6); \ reg7_m = __msa_srai_w(reg7_m, 6); \ @@ -284,6 +278,34 @@ extern "C" { out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ } +#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ + { \ + v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \ + v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \ + _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \ + _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \ + _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \ + _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \ + _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \ + _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \ + _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \ + _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \ + _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \ + _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \ + _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \ + _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \ + _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \ + _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \ + _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \ + _reg1 = (v8i16)__msa_maddv_h(const_112, _reg0, const_8080); \ + _reg3 = (v8i16)__msa_maddv_h(const_112, _reg4, const_8080); \ + _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \ + _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \ + _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \ + _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \ + _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \ + } + void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; @@ -389,20 +411,18 @@ void I422ToARGBRow_MSA(const uint8_t* src_y, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = 
(v16u8)__msa_ldi_b(ALPHA_VAL); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_y += 8; src_u += 4; @@ -420,20 +440,18 @@ void I422ToRGBARow_MSA(const uint8_t* src_y, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(alpha, vec0, vec1, vec2, dst_argb); src_y += 8; src_u += 4; @@ -453,12 +471,11 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, int64_t data_a; v16u8 src0, src1, src2, src3; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v4i32 zero = {0}; - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -467,8 +484,7 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y, READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); STOREARGB(vec0, vec1, vec2, src3, dst_argb); src_y += 8; @@ -489,7 +505,7 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, int64_t data_u, data_v; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; v8i16 vec0, vec1, vec2, vec3, vec4, vec5; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 reg0, reg1, reg2, reg3; v2i64 zero = {0}; @@ -498,8 +514,7 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31}; - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = 
(v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -512,10 +527,8 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y, src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); - YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec3, vec4, vec5); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5); reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); @@ -542,19 +555,17 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y, int x; v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec2, vec1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec2, vec1); vec0 = __msa_srai_h(vec0, 3); vec1 = __msa_srai_h(vec1, 3); vec2 = __msa_srai_h(vec2, 2); @@ -581,20 +592,18 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y, v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); reg0 = (v8u16)__msa_srai_h(vec0, 4); reg1 = (v8u16)__msa_srai_h(vec1, 4); reg2 = (v8u16)__msa_srai_h(vec2, 4); @@ -621,20 +630,18 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y, v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; v8u16 reg0, reg1, reg2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = 
(v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); reg0 = (v8u16)__msa_srai_h(vec0, 3); reg1 = (v8u16)__msa_srai_h(vec1, 3); reg2 = (v8u16)__msa_srai_h(vec2, 3); @@ -1676,56 +1683,51 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v16u8 dst0; - v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); - v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); - v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; + v16u8 reg0, reg1, reg2, dst; + v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; + v8i16 res0, res1; + v8i16 const_66 = (v8i16)__msa_ldi_h(66); + v8i16 const_129 = (v8i16)__msa_ldi_h(129); + v8i16 const_25 = (v8i16)__msa_ldi_h(25); + v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080); + v16u8 zero = (v16u8)__msa_ldi_b(0); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0); - src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); - reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); - reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); - reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); - reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); - reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); - reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); - reg0 *= const_0x19; - reg1 *= const_0x19; - reg2 *= const_0x81; - reg3 *= const_0x81; - reg4 *= const_0x42; - reg5 *= const_0x42; - reg0 += reg2; - reg1 += reg3; - reg0 += reg4; - reg1 += reg5; - reg0 += const_0x1080; - reg1 += const_0x1080; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); - ST_UB(dst0, dst_y); + src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpg = (v16u8)__msa_srli_b(tmp0, 5); + reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); + reg0 = (v16u8)__msa_slli_b(reg0, 3); + tmpg = (v16u8)__msa_or_v(tmpg, reg0); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); + tmpr = (v16u8)__msa_srli_b(reg1, 2); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_slli_b(tmpg, 3); + reg2 = (v16u8)__msa_slli_b(tmpr, 3); + tmpb = (v16u8)__msa_srli_b(tmpb, 2); + tmpg = (v16u8)__msa_srli_b(tmpg, 2); + tmpr = (v16u8)__msa_srli_b(tmpr, 2); + tmpb = (v16u8)__msa_or_v(reg0, tmpb); + tmpg = (v16u8)__msa_or_v(reg1, tmpg); + tmpr = (v16u8)__msa_or_v(reg2, tmpr); + tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); + tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); 
+ tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); + tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); + tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); + tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); + res0 = (v8i16)__msa_maddv_h(const_25, tmpb_r, const_1080); + res1 = (v8i16)__msa_maddv_h(const_25, tmpb_l, const_1080); + res0 = (v8i16)__msa_maddv_h(tmpg_r, const_129, res0); + res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1); + res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0); + res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1); + dst = (v16u8)__msa_pckod_b(res1, res0); + ST_UB(dst, dst_y); src_argb1555 += 32; dst_y += 16; } @@ -1733,62 +1735,49 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; - v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; - v8u16 reg0, reg1, reg2, reg3, reg4, reg5; - v4u32 res0, res1, res2, res3; - v16u8 dst0; - v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); - v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); - v8i16 const_0x1080 = __msa_fill_h(0x1080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); - v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr; + v16u8 reg0, reg1, dst; + v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; + v8i16 res0, res1; + v8i16 const_66 = (v8i16)__msa_ldi_h(66); + v8i16 const_129 = (v8i16)__msa_ldi_h(129); + v8i16 const_25 = (v8i16)__msa_ldi_h(25); + v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080); + v16u8 zero = __msa_ldi_b(0); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0); - src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16); - vec0 = src0 & const_0x1F; - vec1 = src0 & const_0x7E0; - vec2 = src0 & const_0xF800; - vec3 = src1 & const_0x1F; - vec4 = src1 & const_0x7E0; - vec5 = src1 & const_0xF800; - reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); - reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); - reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); - reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); - reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); - reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); - reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); - reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); - reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); - reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); - reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); - reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); - vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); - vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); - vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); - vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); - vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); - vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); - vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); - vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); - res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); - res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); - res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); - res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); - res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); - res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); - res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); - res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); - res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); - res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); - res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); - res3 = 
(v4u32)__msa_srai_w((v4i32)res3, 8); - vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); - vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); - dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - ST_UB(dst0, dst_y); + src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0); + src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); + reg0 = (v16u8)__msa_srli_b(tmp0, 5); + reg1 = (v16u8)__msa_slli_b(reg1, 3); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_srli_b(tmpb, 2); + tmpb = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_slli_b(tmpg, 2); + reg1 = (v16u8)__msa_srli_b(tmpg, 4); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + reg0 = (v16u8)__msa_srli_b(tmpr, 5); + tmpr = (v16u8)__msa_or_v(tmpr, reg0); + tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb); + tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb); + tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg); + tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg); + tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr); + tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr); + res0 = (v8i16)__msa_maddv_h(const_25, tmpb_r, const_1080); + res1 = (v8i16)__msa_maddv_h(const_25, tmpb_l, const_1080); + res0 = (v8i16)__msa_maddv_h(tmpg_r, const_129, res0); + res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1); + res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0); + res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1); + dst = (v16u8)__msa_pckod_b(res1, res0); + ST_UB(dst, dst_y); src_rgb565 += 32; dst_y += 16; } @@ -1885,69 +1874,61 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, const uint16_t* s = (const uint16_t*)src_argb1555; const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v16u8 src0, src1, src2, src3, dst; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 reg0, reg1, reg2, reg3; + v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; + v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); + v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); + v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); + v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); + v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); + v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_b((void*)s, 0); src1 = (v8u16)__msa_ld_b((void*)s, 16); src2 = (v8u16)__msa_ld_b((void*)t, 0); src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x1F; - vec3 = src1 & const_0x1F; - vec2 += src2 & const_0x1F; - vec3 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = 
(v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); - vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); - vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); - reg0 = vec6 * const_0x70; - reg1 = vec0 * const_0x4A; - reg2 = vec2 * const_0x70; - reg3 = vec0 * const_0x5E; - reg0 += const_0x8080; - reg1 += vec2 * const_0x26; - reg2 += const_0x8080; - reg3 += vec6 * const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmp2 = (v16u8)__msa_pckev_b(src3, src2); + tmp3 = (v16u8)__msa_pckod_b(src3, src2); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); + tmpg = (v16u8)__msa_srli_b(tmp0, 5); + nexg = (v16u8)__msa_srli_b(tmp2, 5); + reg0 = (v16u8)__msa_andi_b(tmp1, 0x03); + reg2 = (v16u8)__msa_andi_b(tmp3, 0x03); + reg0 = (v16u8)__msa_slli_b(reg0, 3); + reg2 = (v16u8)__msa_slli_b(reg2, 3); + tmpg = (v16u8)__msa_or_v(tmpg, reg0); + nexg = (v16u8)__msa_or_v(nexg, reg2); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C); + reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C); + tmpr = (v16u8)__msa_srli_b(reg1, 2); + nexr = (v16u8)__msa_srli_b(reg3, 2); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_slli_b(tmpg, 3); + reg2 = (v16u8)__msa_slli_b(tmpr, 3); + tmpb = (v16u8)__msa_srli_b(tmpb, 2); + tmpg = (v16u8)__msa_srli_b(tmpg, 2); + tmpr = (v16u8)__msa_srli_b(tmpr, 2); + tmpb = (v16u8)__msa_or_v(reg0, tmpb); + tmpg = (v16u8)__msa_or_v(reg1, tmpg); + tmpr = (v16u8)__msa_or_v(reg2, tmpr); + reg0 = (v16u8)__msa_slli_b(nexb, 3); + reg1 = (v16u8)__msa_slli_b(nexg, 3); + reg2 = (v16u8)__msa_slli_b(nexr, 3); + nexb = (v16u8)__msa_srli_b(nexb, 2); + nexg = (v16u8)__msa_srli_b(nexg, 2); + nexr = (v16u8)__msa_srli_b(nexr, 2); + nexb = (v16u8)__msa_or_v(reg0, nexb); + nexg = (v16u8)__msa_or_v(reg1, nexg); + nexr = (v16u8)__msa_or_v(reg2, nexr); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); + res0 = __msa_copy_u_d((v2i64)dst, 0); + res1 = __msa_copy_u_d((v2i64)dst, 1); SD(res0, dst_u); SD(res1, dst_v); s += 16; @@ -1966,68 +1947,57 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, const uint16_t* s = (const uint16_t*)src_rgb565; const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); int64_t res0, res1; - v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; - v8u16 vec0, vec1, vec2, vec3, vec4, vec5; - v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); - v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); - v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); - v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + v16u8 
src0, src1, src2, src3, dst; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 reg0, reg1, reg2, reg3; + v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; + v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); + v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); + v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); + v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); + v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); + v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); for (x = 0; x < width; x += 16) { - src0 = (v8u16)__msa_ld_b((void*)s, 0); - src1 = (v8u16)__msa_ld_b((void*)s, 16); - src2 = (v8u16)__msa_ld_b((void*)t, 0); - src3 = (v8u16)__msa_ld_b((void*)t, 16); - vec0 = src0 & const_0x1F; - vec1 = src1 & const_0x1F; - vec0 += src2 & const_0x1F; - vec1 += src3 & const_0x1F; - vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); - vec2 = src0 & const_0x3F; - vec3 = src1 & const_0x3F; - vec2 += src2 & const_0x3F; - vec3 += src3 & const_0x3F; - vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); - src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); - src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); - src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); - src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); - vec4 = src0 & const_0x1F; - vec5 = src1 & const_0x1F; - vec4 += src2 & const_0x1F; - vec5 += src3 & const_0x1F; - vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); - vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); - vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); - vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); - vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); - vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); - vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); - vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); - reg0 = vec3 * const_0x70; - reg1 = vec1 * const_0x4A; - reg2 = vec4 * const_0x70; - reg3 = vec1 * const_0x5E; - reg0 += const_32896; - reg1 += vec4 * const_0x26; - reg2 += const_32896; - reg3 += vec3 * const_0x12; - reg0 -= reg1; - reg2 -= reg3; - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); - dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); - res0 = __msa_copy_u_d((v2i64)dst0, 0); - res1 = __msa_copy_u_d((v2i64)dst0, 1); + src0 = (v16u8)__msa_ld_b((void*)s, 0); + src1 = (v16u8)__msa_ld_b((void*)s, 16); + src2 = (v16u8)__msa_ld_b((void*)t, 0); + src3 = (v16u8)__msa_ld_b((void*)t, 16); + tmp0 = (v16u8)__msa_pckev_b(src1, src0); + tmp1 = (v16u8)__msa_pckod_b(src1, src0); + tmp2 = (v16u8)__msa_pckev_b(src3, src2); + tmp3 = (v16u8)__msa_pckod_b(src3, src2); + tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F); + tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8); + nexb = (v16u8)__msa_andi_b(tmp2, 0x1F); + nexr = (v16u8)__msa_andi_b(tmp3, 0xF8); + reg1 = (v16u8)__msa_andi_b(tmp1, 0x07); + reg3 = (v16u8)__msa_andi_b(tmp3, 0x07); + reg0 = (v16u8)__msa_srli_b(tmp0, 5); + reg1 = (v16u8)__msa_slli_b(reg1, 3); + reg2 = (v16u8)__msa_srli_b(tmp2, 5); + reg3 = (v16u8)__msa_slli_b(reg3, 3); + tmpg = (v16u8)__msa_or_v(reg1, reg0); + nexg = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_slli_b(tmpb, 3); + reg1 = (v16u8)__msa_srli_b(tmpb, 2); + reg2 = (v16u8)__msa_slli_b(nexb, 3); + reg3 = (v16u8)__msa_srli_b(nexb, 2); + tmpb = (v16u8)__msa_or_v(reg1, reg0); + nexb = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_slli_b(tmpg, 2); + reg1 = (v16u8)__msa_srli_b(tmpg, 4); + reg2 = (v16u8)__msa_slli_b(nexg, 2); + reg3 = (v16u8)__msa_srli_b(nexg, 4); + tmpg = 
(v16u8)__msa_or_v(reg1, reg0); + nexg = (v16u8)__msa_or_v(reg2, reg3); + reg0 = (v16u8)__msa_srli_b(tmpr, 5); + reg2 = (v16u8)__msa_srli_b(nexr, 5); + tmpr = (v16u8)__msa_or_v(tmpr, reg0); + nexr = (v16u8)__msa_or_v(nexr, reg2); + RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst); + res0 = __msa_copy_u_d((v2i64)dst, 0); + res1 = __msa_copy_u_d((v2i64)dst, 1); SD(res0, dst_u); SD(res1, dst_v); s += 16; @@ -2266,13 +2236,12 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y, uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 zero = {0}; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2281,8 +2250,7 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y, val1 = LD(src_uv); src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); @@ -2303,12 +2271,11 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y, uint64_t val0, val1; v16u8 src0, src1, dst0; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 zero = {0}; - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2317,8 +2284,7 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y, val1 = LD(src_uv); src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); vec0 = vec0 >> 3; vec1 = (vec1 >> 2) << 5; vec2 = (vec2 >> 3) << 11; @@ -2339,14 +2305,13 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y, uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v16u8 zero = {0}; v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2356,8 +2321,7 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y, src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); src1 = (v16u8)__msa_vshf_b(shuffler, 
(v16i8)src1, (v16i8)src1); - YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); @@ -2773,12 +2737,12 @@ void I444ToARGBRow_MSA(const uint8_t* src_y, v16u8 src0, src1, src2, dst0, dst1; v8u16 vec0, vec1, vec2; v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v8i16 zero = {0}; + v4i32 const_80 = __msa_fill_w(80); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); for (x = 0; x < width; x += 8) { READI444(src_y, src_u, src_v, src0, src1, src2); @@ -2789,26 +2753,26 @@ void I444ToARGBRow_MSA(const uint8_t* src_y, reg1 *= vec_yg; reg0 = __msa_srai_w(reg0, 16); reg1 = __msa_srai_w(reg1, 16); - reg4 = reg0 + vec_br; - reg5 = reg1 + vec_br; - reg2 = reg0 + vec_bg; - reg3 = reg1 + vec_bg; - reg0 += vec_bb; - reg1 += vec_bb; + reg0 += vec_yb; + reg1 += vec_yb; vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); - reg0 -= reg6 * vec_ub; - reg1 -= reg7 * vec_ub; - reg2 -= reg6 * vec_ug; - reg3 -= reg7 * vec_ug; - reg4 -= reg8 * vec_vr; - reg5 -= reg9 * vec_vr; - reg2 -= reg8 * vec_vg; - reg3 -= reg9 * vec_vg; + reg6 -= const_80; + reg7 -= const_80; + reg8 -= const_80; + reg9 -= const_80; + reg0 += reg6 * vec_ub; + reg1 += reg7 * vec_ub; + reg2 += reg6 * vec_ug; + reg3 += reg7 * vec_ug; + reg4 += reg8 * vec_vr; + reg5 += reg9 * vec_vr; + reg2 += reg8 * vec_vg; + reg3 += reg9 * vec_vg; reg0 = __msa_srai_w(reg0, 6); reg1 = __msa_srai_w(reg1, 6); reg2 = __msa_srai_w(reg2, 6); @@ -2922,12 +2886,11 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2935,8 +2898,7 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_yuy2 += 16; dst_argb += 32; @@ -2950,12 +2912,11 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; - v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 
vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); - YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, - vec_br, vec_yg); + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); @@ -2963,8 +2924,7 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); - YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, - vec0, vec1, vec2); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_uyvy += 16; dst_argb += 32;
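Note on the YUVTORGB_SETUP / YUVTORGB change: the three per-channel bias vectors (kUVBiasB/G/R) are gone, and a single Y bias (kYBiasToRgb) is added immediately after the kYToRgb scale-and-shift; the I444 path likewise centers U/V before its multiply-accumulate. The scalar sketch below is a minimal per-pixel model of that fixed-point pipeline, assuming BT.601 limited-range coefficients scaled by 64 and a conventional 128 chroma midpoint; the constant values, the sign handling on the green terms, and ClampU8 are illustrative assumptions, not values read out of struct YuvConstants.

#include <stdint.h>

static inline uint8_t ClampU8(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One pixel of the updated flow: bias the Y term once (yb) right after the
 * yg multiply and >> 16, keep chroma centered on 128, shift by 6 at the end.
 * In the vector macro the signs of the green terms are folded into the
 * constant tables; they are written out explicitly here. */
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  const int yg = 19071;  /* ~1.164 * 64 * 256, illustrative */
  const int yb = -1196;  /* cancels the +16 offset of limited-range Y, illustrative */
  const int ub = 129, ug = 25, vg = 52, vr = 102; /* 2.018/0.391/0.813/1.596 * 64 */
  int y1 = ((y * 0x0101 * yg) >> 16) + yb;  /* Y term, bias applied here */
  int ui = (int)u - 128;
  int vi = (int)v - 128;
  *b = ClampU8((y1 + ub * ui) >> 6);
  *g = ClampU8((y1 - ug * ui - vg * vi) >> 6);
  *r = ClampU8((y1 + vr * vi) >> 6);
}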
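For the 16-bpp outputs (I422ToRGB565Row_MSA, I422ToARGB4444Row_MSA, I422ToARGB1555Row_MSA, NV12ToRGB565Row_MSA) only the YUVTORGB invocation changes; the packing shifts and the const_0x8000 / const_0xF000 alpha masks stay as they were. As a reference for what those shifts amount to per pixel, here is a scalar sketch; the helper names are mine and the usual little-endian field layouts are assumed.

#include <stdint.h>

/* b/g/r are the full 8-bit values produced by the YUVTORGB stage. */
static uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}

static uint16_t PackARGB1555(uint8_t b, uint8_t g, uint8_t r) {
  /* 0x8000 is the constant alpha bit ORed in via const_0x8000. */
  return (uint16_t)(0x8000 | ((r >> 3) << 10) | ((g >> 3) << 5) | (b >> 3));
}

static uint16_t PackARGB4444(uint8_t b, uint8_t g, uint8_t r) {
  /* 0xF000 is the constant alpha nibble from const_0xF000. */
  return (uint16_t)(0xF000 | ((r >> 4) << 8) | ((g >> 4) << 4) | (b >> 4));
}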
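The rewritten ARGB1555ToYRow_MSA and RGB565ToYRow_MSA widen the 5/6-bit fields by bit replication (shift left, OR in the top bits) and then form luma with __msa_maddv_h using the 25/129/66 weights and the 0x1080 constant, keeping the odd (high) byte of each 16-bit lane rather than shifting right by 8. A per-pixel scalar equivalent of my reading of the new loops (helper names are mine):

#include <stdint.h>

static inline uint8_t Expand5(uint8_t v) { return (uint8_t)((v << 3) | (v >> 2)); }
static inline uint8_t Expand6(uint8_t v) { return (uint8_t)((v << 2) | (v >> 4)); }

static uint8_t RGB565PixelToY(uint16_t px) {
  uint8_t b = Expand5(px & 0x1F);
  uint8_t g = Expand6((px >> 5) & 0x3F);
  uint8_t r = Expand5((px >> 11) & 0x1F);
  /* 0x1080 = (16 << 8) + 128: the +16 luma offset plus +0.5 rounding,
   * applied before the implicit >> 8 (taking the high byte). */
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}

static uint8_t ARGB1555PixelToY(uint16_t px) {
  uint8_t b = Expand5(px & 0x1F);
  uint8_t g = Expand5((px >> 5) & 0x1F);
  uint8_t r = Expand5((px >> 10) & 0x1F);
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}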
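The new RGBTOUV macro used by ARGB1555ToUVRow_MSA and RGB565ToUVRow_MSA interleaves each channel with the row below, sums the vertical pairs, averages adjacent columns (so the value stays at twice the 2x2 mean), and then applies halved weights: 0x38/0x25/0x13/0x2F/0x09 read as 112/74/38/94/18 divided by two, with 0x8080 supplying the +128 output offset and rounding before the high byte is kept. Below is a scalar sketch of one 2x2 block, matching the vector code up to intermediate rounding; the struct and helper names are mine.

#include <stdint.h>

typedef struct { uint8_t b, g, r; } Rgb8;

/* p00/p01 are two neighbouring pixels of the current row, p10/p11 the pixels
 * directly below them; one U and one V byte come out per 2x2 block. */
static void BlockToUV(Rgb8 p00, Rgb8 p01, Rgb8 p10, Rgb8 p11,
                      uint8_t* u, uint8_t* v) {
  int ab = (p00.b + p01.b + p10.b + p11.b + 2) >> 2;  /* rounded 2x2 means */
  int ag = (p00.g + p01.g + p10.g + p11.g + 2) >> 2;
  int ar = (p00.r + p01.r + p10.r + p11.r + 2) >> 2;
  /* 0x8080 = (128 << 8) + 128: +128 offset plus rounding before >> 8. */
  *u = (uint8_t)((112 * ab - 74 * ag - 38 * ar + 0x8080) >> 8);
  *v = (uint8_t)((112 * ar - 94 * ag - 18 * ab + 0x8080) >> 8);
}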