Fix bugs on the MIPS platform.

This patch fixes compilation errors caused by the removal of the kUVBias
constants, as well as two failing test cases,
LibYUVConvertTest.RGB565ToI420_Opt and LibYUVConvertTest.ARGB1555ToI420_Opt.
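
For reference, the rewritten MSA kernels follow the standard BT.601-style
scalar model below, with the single kYBiasToRgb constant taking over from
the removed per-channel kUVBiasB/G/R constants. This is an illustrative
sketch, not code from this patch; clamp8 and the coefficient variable
names are ours:

  static inline uint8_t clamp8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

  // ub/ug/vg/vr come from kUVToB/kUVToG/kUVToR, yg = kYToRgb[0],
  // yb = kYBiasToRgb[0]; y is replicated to 16 bits before the multiply.
  int y1 = (int)((uint32_t)(y * 0x0101 * yg) >> 16) + yb;
  uint8_t b = clamp8((y1 + ub * (u - 128)) >> 6);
  uint8_t g = clamp8((y1 - ug * (u - 128) - vg * (v - 128)) >> 6);
  uint8_t r = clamp8((y1 + vr * (v - 128)) >> 6);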

Bug: libyuv:918
Change-Id: I1a66bcd7ef616aacbeca5b4015013015ccdf0f18
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3477416
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
commit 3b8c86d23a
parent b4ddbaf549
Author: Hao Chen
Date: 2022-02-21 20:33:29 +08:00
Committer: libyuv LUCI CQ

@@ -24,16 +24,14 @@ extern "C" {
#define ALPHA_VAL (-1)
// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
{ \
ub = __msa_fill_w(yuvconst->kUVToB[0]); \
vr = __msa_fill_w(yuvconst->kUVToR[1]); \
ug = __msa_fill_w(yuvconst->kUVToG[0]); \
vg = __msa_fill_w(yuvconst->kUVToG[1]); \
bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \
}
// Load YUV 422 pixel data
@@ -70,7 +68,7 @@ extern "C" {
}
// Convert 8 pixels of YUV 420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
{ \
v8i16 vec0_m, vec1_m; \
v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
@@ -89,6 +87,8 @@ extern "C" {
reg3_m *= ubvr; \
reg0_m = __msa_srai_w(reg0_m, 16); \
reg1_m = __msa_srai_w(reg1_m, 16); \
reg0_m += yb; \
reg1_m += yb; \
reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
@@ -102,12 +102,6 @@ extern "C" {
reg3_m = reg1_m - reg3_m; \
reg7_m = reg0_m - reg7_m; \
reg4_m = reg1_m - reg4_m; \
reg5_m += bb; \
reg6_m += bb; \
reg7_m += bg; \
reg4_m += bg; \
reg2_m += br; \
reg3_m += br; \
reg5_m = __msa_srai_w(reg5_m, 6); \
reg6_m = __msa_srai_w(reg6_m, 6); \
reg7_m = __msa_srai_w(reg7_m, 6); \
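
The net effect of this macro change per channel, in scalar form (an
illustrative transcription of the lines above, not code from the file;
uv_term stands for the channel's U/V contribution):

  // before: out = clamp((y_scaled - uv_term + kUVBias) >> 6)
  // after:  out = clamp(((y_scaled + kYBiasToRgb) - uv_term) >> 6)

Here y_scaled is the (y * yg) >> 16 value computed above; the 128-centering
of chroma that kUVBias used to fold in now travels with uv_term.
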
@@ -284,6 +278,34 @@ extern "C" {
out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
}
#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
{ \
v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
_tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \
_tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \
_tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
_tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
_tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
_tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
_reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \
_reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \
_reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
_reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
_reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
_reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
_reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \
_reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \
_reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \
_reg1 = (v8i16)__msa_maddv_h(const_112, _reg0, const_8080); \
_reg3 = (v8i16)__msa_maddv_h(const_112, _reg4, const_8080); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \
_dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \
}
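
RGBTOUV condenses the 2x2 subsample and the fixed-point U/V transform into
one macro. A scalar model for one 2x2 block may help (an illustrative
sketch, not code from the patch; indices 0/1 are the two columns, 2/3 the
same columns one row down, mirroring the ilvev/ilvod + hadd + aver steps).
Note the halfword constants hold half the nominal coefficients (const_112
is loaded as 0x38 == 56, and so on) because the averaged inputs are still
doubled, 9-bit values:

  static inline void rgb_to_uv_2x2(const uint8_t b[4], const uint8_t g[4],
                                   const uint8_t r[4],
                                   uint8_t* out_u, uint8_t* out_v) {
    // Twice the rounded 4-sample average, i.e. a 9-bit value.
    uint16_t bb = (uint16_t)((b[0] + b[2] + b[1] + b[3] + 1) >> 1);
    uint16_t gg = (uint16_t)((g[0] + g[2] + g[1] + g[3] + 1) >> 1);
    uint16_t rr = (uint16_t)((r[0] + r[2] + r[1] + r[3] + 1) >> 1);
    // maddv/msubv wrap modulo 2^16; pckod_b keeps the high byte, i.e. >> 8.
    *out_u = (uint8_t)((uint16_t)(56 * bb - 37 * gg - 19 * rr + 0x8080) >> 8);
    *out_v = (uint8_t)((uint16_t)(56 * rr - 47 * gg - 9 * bb + 0x8080) >> 8);
  }
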
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
int x;
v16u8 src0, src1, src2, src3;
@@ -389,20 +411,18 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_y += 8;
src_u += 4;
@@ -420,20 +440,18 @@ void I422ToRGBARow_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
src_y += 8;
src_u += 4;
@@ -453,12 +471,11 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
int64_t data_a;
v16u8 src0, src1, src2, src3;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v4i32 zero = {0};
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -467,8 +484,7 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
STOREARGB(vec0, vec1, vec2, src3, dst_argb);
src_y += 8;
@@ -489,7 +505,7 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
int64_t data_u, data_v;
v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 reg0, reg1, reg2, reg3;
v2i64 zero = {0};
@@ -498,8 +514,7 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
11, 29, 12, 13, 30, 14, 15, 31};
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -512,10 +527,8 @@ void I422ToRGB24Row_MSA(const uint8_t* src_y,
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec3, vec4, vec5);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec3, vec4, vec5);
reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
@@ -542,19 +555,17 @@ void I422ToRGB565Row_MSA(const uint8_t* src_y,
int x;
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec2, vec1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec2, vec1);
vec0 = __msa_srai_h(vec0, 3);
vec1 = __msa_srai_h(vec1, 3);
vec2 = __msa_srai_h(vec2, 2);
@@ -581,20 +592,18 @@ void I422ToARGB4444Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
reg0 = (v8u16)__msa_srai_h(vec0, 4);
reg1 = (v8u16)__msa_srai_h(vec1, 4);
reg2 = (v8u16)__msa_srai_h(vec2, 4);
@@ -621,20 +630,18 @@ void I422ToARGB1555Row_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0;
v8i16 vec0, vec1, vec2;
v8u16 reg0, reg1, reg2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
for (x = 0; x < width; x += 8) {
READYUV422(src_y, src_u, src_v, src0, src1, src2);
src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
reg0 = (v8u16)__msa_srai_h(vec0, 3);
reg1 = (v8u16)__msa_srai_h(vec1, 3);
reg2 = (v8u16)__msa_srai_h(vec2, 3);
@@ -1676,56 +1683,51 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
int x;
v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
v16u8 dst0;
v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
v16u8 reg0, reg1, reg2, dst;
v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
v8i16 res0, res1;
v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_129 = (v8i16)__msa_ldi_h(129);
v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080);
v16u8 zero = (v16u8)__msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0);
src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16);
vec0 = src0 & const_0x1F;
vec1 = src1 & const_0x1F;
src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
vec2 = src0 & const_0x1F;
vec3 = src1 & const_0x1F;
src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
vec4 = src0 & const_0x1F;
vec5 = src1 & const_0x1F;
reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
reg0 *= const_0x19;
reg1 *= const_0x19;
reg2 *= const_0x81;
reg3 *= const_0x81;
reg4 *= const_0x42;
reg5 *= const_0x42;
reg0 += reg2;
reg1 += reg3;
reg0 += reg4;
reg1 += reg5;
reg0 += const_0x1080;
reg1 += const_0x1080;
reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_y);
src0 = (v16u8)__msa_ld_b((void*)src_argb1555, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb1555, 16);
tmp0 = (v16u8)__msa_pckev_b(src1, src0);
tmp1 = (v16u8)__msa_pckod_b(src1, src0);
tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
tmpg = (v16u8)__msa_srli_b(tmp0, 5);
reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
reg0 = (v16u8)__msa_slli_b(reg0, 3);
tmpg = (v16u8)__msa_or_v(tmpg, reg0);
reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
tmpr = (v16u8)__msa_srli_b(reg1, 2);
reg0 = (v16u8)__msa_slli_b(tmpb, 3);
reg1 = (v16u8)__msa_slli_b(tmpg, 3);
reg2 = (v16u8)__msa_slli_b(tmpr, 3);
tmpb = (v16u8)__msa_srli_b(tmpb, 2);
tmpg = (v16u8)__msa_srli_b(tmpg, 2);
tmpr = (v16u8)__msa_srli_b(tmpr, 2);
tmpb = (v16u8)__msa_or_v(reg0, tmpb);
tmpg = (v16u8)__msa_or_v(reg1, tmpg);
tmpr = (v16u8)__msa_or_v(reg2, tmpr);
tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
res0 = (v8i16)__msa_maddv_h(const_25, tmpb_r, const_1080);
res1 = (v8i16)__msa_maddv_h(const_25, tmpb_l, const_1080);
res0 = (v8i16)__msa_maddv_h(tmpg_r, const_129, res0);
res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1);
res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0);
res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1);
dst = (v16u8)__msa_pckod_b(res1, res0);
ST_UB(dst, dst_y);
src_argb1555 += 32;
dst_y += 16;
}
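
The rewritten row unpacks ARGB1555 bytewise: pckev/pckod split each pixel
into its low and high byte, the masks and shifts extract the three 5-bit
fields, and each field is widened to 8 bits by replicating its top bits.
A scalar equivalent for one pixel, under the same constants (illustrative,
not code from the patch):

  static inline uint8_t argb1555_to_y(uint16_t px) {
    // ARGB1555 fields: b in bits 0..4, g in 5..9, r in 10..14 (15 is alpha).
    uint8_t b5 = px & 0x1F, g5 = (px >> 5) & 0x1F, r5 = (px >> 10) & 0x1F;
    // Widen 5 -> 8 bits: (x << 3) | (x >> 2).
    uint8_t b = (uint8_t)((b5 << 3) | (b5 >> 2));
    uint8_t g = (uint8_t)((g5 << 3) | (g5 >> 2));
    uint8_t r = (uint8_t)((r5 << 3) | (r5 >> 2));
    // BT.601 luma in 8-bit fixed point; the 0x1080 bias folds in the +16
    // offset and rounding (const_66/const_129/const_25/const_1080 above).
    return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
  }
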
@@ -1733,62 +1735,49 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
int x;
v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
v4u32 res0, res1, res2, res3;
v16u8 dst0;
v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
v8i16 const_0x1080 = __msa_fill_h(0x1080);
v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
v16u8 src0, src1, tmp0, tmp1, tmpb, tmpg, tmpr;
v16u8 reg0, reg1, dst;
v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
v8i16 res0, res1;
v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_129 = (v8i16)__msa_ldi_h(129);
v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
v16u8 zero = __msa_ldi_b(0);
for (x = 0; x < width; x += 16) {
src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0);
src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16);
vec0 = src0 & const_0x1F;
vec1 = src0 & const_0x7E0;
vec2 = src0 & const_0xF800;
vec3 = src1 & const_0x1F;
vec4 = src1 & const_0x7E0;
vec5 = src1 & const_0xF800;
reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
src0 = (v16u8)__msa_ld_b((void*)src_rgb565, 0);
src1 = (v16u8)__msa_ld_b((void*)src_rgb565, 16);
tmp0 = (v16u8)__msa_pckev_b(src1, src0);
tmp1 = (v16u8)__msa_pckod_b(src1, src0);
tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
reg0 = (v16u8)__msa_srli_b(tmp0, 5);
reg1 = (v16u8)__msa_slli_b(reg1, 3);
tmpg = (v16u8)__msa_or_v(reg1, reg0);
reg0 = (v16u8)__msa_slli_b(tmpb, 3);
reg1 = (v16u8)__msa_srli_b(tmpb, 2);
tmpb = (v16u8)__msa_or_v(reg1, reg0);
reg0 = (v16u8)__msa_slli_b(tmpg, 2);
reg1 = (v16u8)__msa_srli_b(tmpg, 4);
tmpg = (v16u8)__msa_or_v(reg1, reg0);
reg0 = (v16u8)__msa_srli_b(tmpr, 5);
tmpr = (v16u8)__msa_or_v(tmpr, reg0);
tmpb_r = (v8i16)__msa_ilvr_b(zero, tmpb);
tmpb_l = (v8i16)__msa_ilvl_b(zero, tmpb);
tmpg_r = (v8i16)__msa_ilvr_b(zero, tmpg);
tmpg_l = (v8i16)__msa_ilvl_b(zero, tmpg);
tmpr_r = (v8i16)__msa_ilvr_b(zero, tmpr);
tmpr_l = (v8i16)__msa_ilvl_b(zero, tmpr);
res0 = (v8i16)__msa_maddv_h(const_25, tmpb_r, const_1080);
res1 = (v8i16)__msa_maddv_h(const_25, tmpb_l, const_1080);
res0 = (v8i16)__msa_maddv_h(tmpg_r, const_129, res0);
res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1);
res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0);
res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1);
dst = (v16u8)__msa_pckod_b(res1, res0);
ST_UB(dst, dst_y);
src_rgb565 += 32;
dst_y += 16;
}
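
RGB565ToYRow_MSA gets the same bytewise treatment. Red is masked as
tmp1 & 0xF8, which is already r << 3, so only the (r >> 2) replication is
OR-ed in, and the 6-bit green widens with a 2/4 bit split instead of 3/2.
A scalar equivalent for one pixel (illustrative, not code from the patch):

  static inline uint8_t rgb565_to_y(uint16_t px) {
    // RGB565 fields: b in bits 0..4, g in 5..10, r in 11..15.
    uint8_t b5 = px & 0x1F, g6 = (px >> 5) & 0x3F, r5 = (uint8_t)(px >> 11);
    uint8_t b = (uint8_t)((b5 << 3) | (b5 >> 2));  // 5 -> 8 bits
    uint8_t g = (uint8_t)((g6 << 2) | (g6 >> 4));  // 6 -> 8 bits
    uint8_t r = (uint8_t)((r5 << 3) | (r5 >> 2));  // 5 -> 8 bits
    return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
  }
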
@@ -1885,69 +1874,61 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
const uint16_t* s = (const uint16_t*)src_argb1555;
const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
int64_t res0, res1;
v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
v16u8 dst0;
v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
v16u8 src0, src1, src2, src3, dst;
v16u8 tmp0, tmp1, tmp2, tmp3;
v16u8 reg0, reg1, reg2, reg3;
v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
src0 = (v8u16)__msa_ld_b((void*)s, 0);
src1 = (v8u16)__msa_ld_b((void*)s, 16);
src2 = (v8u16)__msa_ld_b((void*)t, 0);
src3 = (v8u16)__msa_ld_b((void*)t, 16);
vec0 = src0 & const_0x1F;
vec1 = src1 & const_0x1F;
vec0 += src2 & const_0x1F;
vec1 += src3 & const_0x1F;
vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
vec2 = src0 & const_0x1F;
vec3 = src1 & const_0x1F;
vec2 += src2 & const_0x1F;
vec3 += src3 & const_0x1F;
vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
vec4 = src0 & const_0x1F;
vec5 = src1 & const_0x1F;
vec4 += src2 & const_0x1F;
vec5 += src3 & const_0x1F;
vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
reg0 = vec6 * const_0x70;
reg1 = vec0 * const_0x4A;
reg2 = vec2 * const_0x70;
reg3 = vec0 * const_0x5E;
reg0 += const_0x8080;
reg1 += vec2 * const_0x26;
reg2 += const_0x8080;
reg3 += vec6 * const_0x12;
reg0 -= reg1;
reg2 -= reg3;
reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
res0 = __msa_copy_u_d((v2i64)dst0, 0);
res1 = __msa_copy_u_d((v2i64)dst0, 1);
tmp0 = (v16u8)__msa_pckev_b(src1, src0);
tmp1 = (v16u8)__msa_pckod_b(src1, src0);
tmp2 = (v16u8)__msa_pckev_b(src3, src2);
tmp3 = (v16u8)__msa_pckod_b(src3, src2);
tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
tmpg = (v16u8)__msa_srli_b(tmp0, 5);
nexg = (v16u8)__msa_srli_b(tmp2, 5);
reg0 = (v16u8)__msa_andi_b(tmp1, 0x03);
reg2 = (v16u8)__msa_andi_b(tmp3, 0x03);
reg0 = (v16u8)__msa_slli_b(reg0, 3);
reg2 = (v16u8)__msa_slli_b(reg2, 3);
tmpg = (v16u8)__msa_or_v(tmpg, reg0);
nexg = (v16u8)__msa_or_v(nexg, reg2);
reg1 = (v16u8)__msa_andi_b(tmp1, 0x7C);
reg3 = (v16u8)__msa_andi_b(tmp3, 0x7C);
tmpr = (v16u8)__msa_srli_b(reg1, 2);
nexr = (v16u8)__msa_srli_b(reg3, 2);
reg0 = (v16u8)__msa_slli_b(tmpb, 3);
reg1 = (v16u8)__msa_slli_b(tmpg, 3);
reg2 = (v16u8)__msa_slli_b(tmpr, 3);
tmpb = (v16u8)__msa_srli_b(tmpb, 2);
tmpg = (v16u8)__msa_srli_b(tmpg, 2);
tmpr = (v16u8)__msa_srli_b(tmpr, 2);
tmpb = (v16u8)__msa_or_v(reg0, tmpb);
tmpg = (v16u8)__msa_or_v(reg1, tmpg);
tmpr = (v16u8)__msa_or_v(reg2, tmpr);
reg0 = (v16u8)__msa_slli_b(nexb, 3);
reg1 = (v16u8)__msa_slli_b(nexg, 3);
reg2 = (v16u8)__msa_slli_b(nexr, 3);
nexb = (v16u8)__msa_srli_b(nexb, 2);
nexg = (v16u8)__msa_srli_b(nexg, 2);
nexr = (v16u8)__msa_srli_b(nexr, 2);
nexb = (v16u8)__msa_or_v(reg0, nexb);
nexg = (v16u8)__msa_or_v(reg1, nexg);
nexr = (v16u8)__msa_or_v(reg2, nexr);
RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
res0 = __msa_copy_u_d((v2i64)dst, 0);
res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
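
With both rows unpacked and widened, the shared RGBTOUV macro performs the
2x2 averaging and the color transform in one step. In terms of the scalar
helpers sketched earlier (all illustrative), one U/V output pair per 2x2
block looks like this, where s points at the current row and t at the next:

  static inline uint8_t unpack5(uint8_t x) {
    return (uint8_t)((x << 3) | (x >> 2));
  }

  uint8_t b[4], g[4], r[4], u, v;
  for (int i = 0; i < 4; ++i) {
    uint16_t px = (i < 2) ? s[i] : t[i - 2];  // columns 0..1, rows s and t
    b[i] = unpack5(px & 0x1F);
    g[i] = unpack5((px >> 5) & 0x1F);
    r[i] = unpack5((px >> 10) & 0x1F);
  }
  rgb_to_uv_2x2(b, g, r, &u, &v);
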
@@ -1966,68 +1947,57 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
const uint16_t* s = (const uint16_t*)src_rgb565;
const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
int64_t res0, res1;
v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
v16u8 dst0;
v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
v16u8 src0, src1, src2, src3, dst;
v16u8 tmp0, tmp1, tmp2, tmp3;
v16u8 reg0, reg1, reg2, reg3;
v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
src0 = (v8u16)__msa_ld_b((void*)s, 0);
src1 = (v8u16)__msa_ld_b((void*)s, 16);
src2 = (v8u16)__msa_ld_b((void*)t, 0);
src3 = (v8u16)__msa_ld_b((void*)t, 16);
vec0 = src0 & const_0x1F;
vec1 = src1 & const_0x1F;
vec0 += src2 & const_0x1F;
vec1 += src3 & const_0x1F;
vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
vec2 = src0 & const_0x3F;
vec3 = src1 & const_0x3F;
vec2 += src2 & const_0x3F;
vec3 += src3 & const_0x3F;
vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
vec4 = src0 & const_0x1F;
vec5 = src1 & const_0x1F;
vec4 += src2 & const_0x1F;
vec5 += src3 & const_0x1F;
vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
reg0 = vec3 * const_0x70;
reg1 = vec1 * const_0x4A;
reg2 = vec4 * const_0x70;
reg3 = vec1 * const_0x5E;
reg0 += const_32896;
reg1 += vec4 * const_0x26;
reg2 += const_32896;
reg3 += vec3 * const_0x12;
reg0 -= reg1;
reg2 -= reg3;
reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
res0 = __msa_copy_u_d((v2i64)dst0, 0);
res1 = __msa_copy_u_d((v2i64)dst0, 1);
src0 = (v16u8)__msa_ld_b((void*)s, 0);
src1 = (v16u8)__msa_ld_b((void*)s, 16);
src2 = (v16u8)__msa_ld_b((void*)t, 0);
src3 = (v16u8)__msa_ld_b((void*)t, 16);
tmp0 = (v16u8)__msa_pckev_b(src1, src0);
tmp1 = (v16u8)__msa_pckod_b(src1, src0);
tmp2 = (v16u8)__msa_pckev_b(src3, src2);
tmp3 = (v16u8)__msa_pckod_b(src3, src2);
tmpb = (v16u8)__msa_andi_b(tmp0, 0x1F);
tmpr = (v16u8)__msa_andi_b(tmp1, 0xF8);
nexb = (v16u8)__msa_andi_b(tmp2, 0x1F);
nexr = (v16u8)__msa_andi_b(tmp3, 0xF8);
reg1 = (v16u8)__msa_andi_b(tmp1, 0x07);
reg3 = (v16u8)__msa_andi_b(tmp3, 0x07);
reg0 = (v16u8)__msa_srli_b(tmp0, 5);
reg1 = (v16u8)__msa_slli_b(reg1, 3);
reg2 = (v16u8)__msa_srli_b(tmp2, 5);
reg3 = (v16u8)__msa_slli_b(reg3, 3);
tmpg = (v16u8)__msa_or_v(reg1, reg0);
nexg = (v16u8)__msa_or_v(reg2, reg3);
reg0 = (v16u8)__msa_slli_b(tmpb, 3);
reg1 = (v16u8)__msa_srli_b(tmpb, 2);
reg2 = (v16u8)__msa_slli_b(nexb, 3);
reg3 = (v16u8)__msa_srli_b(nexb, 2);
tmpb = (v16u8)__msa_or_v(reg1, reg0);
nexb = (v16u8)__msa_or_v(reg2, reg3);
reg0 = (v16u8)__msa_slli_b(tmpg, 2);
reg1 = (v16u8)__msa_srli_b(tmpg, 4);
reg2 = (v16u8)__msa_slli_b(nexg, 2);
reg3 = (v16u8)__msa_srli_b(nexg, 4);
tmpg = (v16u8)__msa_or_v(reg1, reg0);
nexg = (v16u8)__msa_or_v(reg2, reg3);
reg0 = (v16u8)__msa_srli_b(tmpr, 5);
reg2 = (v16u8)__msa_srli_b(nexr, 5);
tmpr = (v16u8)__msa_or_v(tmpr, reg0);
nexr = (v16u8)__msa_or_v(nexr, reg2);
RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, dst);
res0 = __msa_copy_u_d((v2i64)dst, 0);
res1 = __msa_copy_u_d((v2i64)dst, 1);
SD(res0, dst_u);
SD(res1, dst_v);
s += 16;
@@ -2266,13 +2236,12 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 zero = {0};
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2281,8 +2250,7 @@ void NV12ToARGBRow_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2303,12 +2271,11 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, dst0;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 zero = {0};
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2317,8 +2284,7 @@ void NV12ToRGB565Row_MSA(const uint8_t* src_y,
val1 = LD(src_uv);
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
vec0 = vec0 >> 3;
vec1 = (vec1 >> 2) << 5;
vec2 = (vec2 >> 3) << 11;
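
The three shifts above reduce B, G and R to 5, 6 and 5 bits; the << 5 and
<< 11 then move green and red into place. A scalar sketch of the packing
for one pixel (illustrative):

  uint16_t rgb565 = (uint16_t)((b >> 3) | ((uint16_t)(g >> 2) << 5) |
                               ((uint16_t)(r >> 3) << 11));
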
@@ -2339,14 +2305,13 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
uint64_t val0, val1;
v16u8 src0, src1, res0, res1, dst0, dst1;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v16u8 zero = {0};
v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2356,8 +2321,7 @@ void NV21ToARGBRow_MSA(const uint8_t* src_y,
src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
@@ -2773,12 +2737,12 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
v16u8 src0, src1, src2, dst0, dst1;
v8u16 vec0, vec1, vec2;
v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
v8i16 zero = {0};
v4i32 const_80 = __msa_fill_w(80);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
for (x = 0; x < width; x += 8) {
READI444(src_y, src_u, src_v, src0, src1, src2);
@@ -2789,26 +2753,26 @@ void I444ToARGBRow_MSA(const uint8_t* src_y,
reg1 *= vec_yg;
reg0 = __msa_srai_w(reg0, 16);
reg1 = __msa_srai_w(reg1, 16);
reg4 = reg0 + vec_br;
reg5 = reg1 + vec_br;
reg2 = reg0 + vec_bg;
reg3 = reg1 + vec_bg;
reg0 += vec_bb;
reg1 += vec_bb;
reg0 += vec_yb;
reg1 += vec_yb;
vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
reg0 -= reg6 * vec_ub;
reg1 -= reg7 * vec_ub;
reg2 -= reg6 * vec_ug;
reg3 -= reg7 * vec_ug;
reg4 -= reg8 * vec_vr;
reg5 -= reg9 * vec_vr;
reg2 -= reg8 * vec_vg;
reg3 -= reg9 * vec_vg;
reg6 -= const_80;
reg7 -= const_80;
reg8 -= const_80;
reg9 -= const_80;
reg0 += reg6 * vec_ub;
reg1 += reg7 * vec_ub;
reg2 += reg6 * vec_ug;
reg3 += reg7 * vec_ug;
reg4 += reg8 * vec_vr;
reg5 += reg9 * vec_vr;
reg2 += reg8 * vec_vg;
reg3 += reg9 * vec_vg;
reg0 = __msa_srai_w(reg0, 6);
reg1 = __msa_srai_w(reg1, 6);
reg2 = __msa_srai_w(reg2, 6);
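
The I444 kernel shows the same restructuring in the open: the Y bias is
added right after the >> 16 scale (reg0 += vec_yb), and the chroma samples
are centered up front (reg6 -= const_80) so that the coefficient products
are added, instead of being subtracted from per-channel pre-biased sums.
For the blue channel, transcribing the visible steps (illustrative; u is
the raw chroma value in this path's fixed-point scaling):

  // before: b = ((y_rep * yg >> 16) + kUVBiasB - u * ub) >> 6
  // after:  b = ((y_rep * yg >> 16) + kYBiasToRgb + (u - 80) * ub) >> 6
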
@@ -2922,12 +2886,11 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2935,8 +2898,7 @@ void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0);
src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_yuy2 += 16;
dst_argb += 32;
@@ -2950,12 +2912,11 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
int x;
v16u8 src0, src1, src2;
v8i16 vec0, vec1, vec2;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb;
v4i32 vec_ubvr, vec_ugvg;
v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
vec_br, vec_yg);
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
@@ -2963,8 +2924,7 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0);
src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
vec0, vec1, vec2);
YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_yg, vec_yb, vec0, vec1, vec2);
STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
src_uyvy += 16;
dst_argb += 32;