Mirror of https://chromium.googlesource.com/libyuv/libyuv, synced 2025-12-06 08:46:47 +08:00
loong64: UV subsample's 4-pixel rounding average and ARGBToJ444 fixed-point scaling
The UV subsample's 4-pixel rounding average and ARGBToJ444 fixed-point scaling were updated in d32d19cc and c060118b. This change updates the LoongArch (LSX/LASX) optimizations to match.

Bug: 381138208
Change-Id: I3585d72564e4fffe514599b1a9b4fee8fbbd0266
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6878364
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
commit eb4e4736a4
parent 70458840c4
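For readers not familiar with the change being ported: the hunks below replace the old pairwise rounding averages with one 4-pixel rounding average when subsampling 2x2 blocks for U/V, and double the coefficient/bias constants accordingly. A minimal scalar sketch of the behavioral difference, using illustrative names (not libyuv API), where a/b/c/d are the four samples of one 2x2 chroma neighborhood:

#include <stdint.h>

// Old behavior (as I read the removed vavgr_hu path): a rounding average of
// the two pairwise sums, which leaves the result at twice pixel scale.
static uint16_t subsample_old(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint16_t)(((a + b) + (c + d) + 1) >> 1);  // ~2x the true average
}

// New behavior (as I read the added widening-add / rounding-narrow path):
// sum all four samples, then one rounding shift by 2 -> a true rounded average.
static uint16_t subsample_new(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint16_t)((a + b + c + d + 2) >> 2);
}

Because the old result was at twice pixel scale, the old code used halved coefficients (0x38 = 56 instead of 112, and so on); the later hunks switch those constants to the full values and change the bias vector from 0x8080 to 0x8000 per 16-bit lane.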
@@ -193,18 +193,30 @@ extern "C" {
  #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \
  { \
- __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+ __m256i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
  _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \
  _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \
  _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \
  _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \
  _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \
  _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \
- _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \
- _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \
- _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \
- _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \
- _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \
+ _tmp4 = __lasx_xvaddwev_w_hu(_tmp0, _tmp1); \
+ _tmp5 = __lasx_xvaddwod_w_hu(_tmp0, _tmp1); \
+ _tmp0 = __lasx_xvilvl_w(_tmp5, _tmp4); \
+ _tmp1 = __lasx_xvilvh_w(_tmp5, _tmp4); \
+ _tmpb = __lasx_xvssrarni_hu_w(_tmp1, _tmp0, 2); \
+ _tmp4 = __lasx_xvaddwev_w_hu(_tmp2, _tmp3); \
+ _tmp5 = __lasx_xvaddwod_w_hu(_tmp2, _tmp3); \
+ _tmp2 = __lasx_xvilvl_w(_tmp5, _tmp4); \
+ _tmp3 = __lasx_xvilvh_w(_tmp5, _tmp4); \
+ _tmpg = __lasx_xvssrarni_hu_w(_tmp3, _tmp2, 2); \
+ _tmp4 = __lasx_xvaddwev_w_hu(_reg0, _reg1); \
+ _tmp5 = __lasx_xvaddwod_w_hu(_reg0, _reg1); \
+ _tmp0 = __lasx_xvilvl_w(_tmp5, _tmp4); \
+ _tmp1 = __lasx_xvilvh_w(_tmp5, _tmp4); \
+ _tmpr = __lasx_xvssrarni_hu_w(_tmp1, _tmp0, 2); \
+ _reg0 = __lasx_xvmadd_h(const_8000, const_112, _tmpb); \
+ _reg1 = __lasx_xvmadd_h(const_8000, const_112, _tmpr); \
  _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \
  _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \
  _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \
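The added pattern above (xvaddwev_w_hu / xvaddwod_w_hu, then xvilvl_w / xvilvh_w, then xvssrarni_hu_w with shift 2) widens the even/odd 16-bit pair sums to 32 bits, restores lane order, and narrows back with a rounding shift. A rough scalar model of the final narrowing step per lane, as I read the intrinsic (an approximation, not the vendor's exact definition):

#include <stdint.h>

// Approximate per-lane model of a saturating, rounding, narrowing right
// shift (vssrarni.hu.w-style) with shift = 2: round, shift, clamp to u16.
static uint16_t round_shift_narrow_u16(int32_t sum4, int shift) {
  int32_t r = (sum4 + (1 << (shift - 1))) >> shift;  // round to nearest
  if (r < 0) r = 0;
  if (r > 65535) r = 65535;
  return (uint16_t)r;
}

With shift = 2 and sum4 being the sum of the four 8-bit samples, this is exactly the 4-pixel rounding average the commit title refers to; saturation never triggers for that input range.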
@@ -787,20 +799,16 @@ void ARGBToUVRow_LASX(const uint8_t* src_argb0,
  __m256i src0, src1, src2, src3, src4, src5, src6, src7;
  __m256i vec0, vec1, vec2, vec3;
  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
- __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
- 0x0038003800380038, 0x0038003800380038};
- __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
- 0x0025002500250025, 0x0025002500250025};
- __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
- 0x0013001300130013, 0x0013001300130013};
- __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
- 0x002f002f002f002f, 0x002f002f002f002f};
- __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
- 0x0009000900090009, 0x0009000900090009};
+ __m256i const_0x70 = __lasx_xvldi(0x470);
+ __m256i const_0x4A = __lasx_xvldi(0x44A);
+ __m256i const_0x26 = __lasx_xvldi(0x426);
+ __m256i const_0x5E = __lasx_xvldi(0x45E);
+ __m256i const_0x12 = __lasx_xvldi(0x412);
  __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
  0x0000000700000003};
- __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
- 0x8080808080808080, 0x8080808080808080};
+ __m256i const_0x8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
@@ -823,17 +831,20 @@ void ARGBToUVRow_LASX(const uint8_t* src_argb0,
  tmp5 = __lasx_xvpickev_h(vec3, vec2);
  vec0 = __lasx_xvpickev_h(tmp1, tmp0);
  vec1 = __lasx_xvpickod_h(tmp1, tmp0);
- src0 = __lasx_xvavgr_h(vec0, vec1);
+ src0 = __lasx_xvadd_h(vec0, vec1);
+ src0 = __lasx_xvsrari_h(src0, 2);
  vec0 = __lasx_xvpickev_h(tmp3, tmp2);
  vec1 = __lasx_xvpickod_h(tmp3, tmp2);
- src1 = __lasx_xvavgr_h(vec0, vec1);
+ src1 = __lasx_xvadd_h(vec0, vec1);
+ src1 = __lasx_xvsrari_h(src1, 2);
  vec0 = __lasx_xvpickev_h(tmp5, tmp4);
  vec1 = __lasx_xvpickod_h(tmp5, tmp4);
- src2 = __lasx_xvavgr_h(vec0, vec1);
- dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
+ src2 = __lasx_xvadd_h(vec0, vec1);
+ src2 = __lasx_xvsrari_h(src2, 2);
+ dst0 = __lasx_xvmadd_h(const_0x8000, src0, const_0x70);
  dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
  dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
- dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lasx_xvmadd_h(const_0x8000, src1, const_0x70);
  dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
  dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
  dst0 = __lasx_xvperm_w(dst0, control);
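With the 4-pixel averages in src0/src1/src2 (blue, red and green respectively, going by the madd/msub operand order), the U/V computation in this hunk is a plain fixed-point dot product with a +128 output bias. A scalar sketch of what one U and one V sample work out to, assuming the final arithmetic shift right by 8 that appears as context in the LSX version of this function further down (illustrative helper names, not libyuv API):

#include <stdint.h>

// Scalar sketch of the fixed-point U/V mapping shown above, with b/g/r the
// rounded 2x2 averages (0..255). 0x8000 >> 8 supplies the +128 chroma
// offset; the coefficients are the usual BT.601 ones scaled by 256.
static uint8_t to_u(int b, int g, int r) {
  return (uint8_t)((0x8000 + 112 * b - 74 * g - 38 * r) >> 8);
}
static uint8_t to_v(int b, int g, int r) {
  return (uint8_t)((0x8000 + 112 * r - 94 * g - 18 * b) >> 8);
}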
@@ -1037,8 +1048,8 @@ void ARGBToUV444Row_LASX(const uint8_t* src_argb,
  __m256i const_38 = __lasx_xvldi(38);
  __m256i const_94 = __lasx_xvldi(94);
  __m256i const_18 = __lasx_xvldi(18);
- __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
- 0x8080808080808080, 0x8080808080808080};
+ __m256i const_0x8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};
  __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
  0x0000000700000003};
  for (x = 0; x < len; x++) {
@@ -1048,8 +1059,8 @@ void ARGBToUV444Row_LASX(const uint8_t* src_argb,
  tmp1 = __lasx_xvpickod_h(src1, src0);
  tmp2 = __lasx_xvpickev_h(src3, src2);
  tmp3 = __lasx_xvpickod_h(src3, src2);
- reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);
- reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp0, const_112);
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp2, const_112);
  reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);
  reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
  reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);
@@ -1058,8 +1069,8 @@ void ARGBToUV444Row_LASX(const uint8_t* src_argb,
  reg1 = __lasx_xvsub_h(reg1, reg3);
  dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
  dst0 = __lasx_xvperm_w(dst0, control);
- reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);
- reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
+ reg0 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp1, const_112);
+ reg1 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp3, const_112);
  reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);
  reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
  reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);
@@ -1606,13 +1617,13 @@ void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m256i reg0, reg1, reg2, reg3, dst0;
- __m256i const_112 = __lasx_xvldi(0x438);
- __m256i const_74 = __lasx_xvldi(0x425);
- __m256i const_38 = __lasx_xvldi(0x413);
- __m256i const_94 = __lasx_xvldi(0x42F);
- __m256i const_18 = __lasx_xvldi(0x409);
- __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
- 0x8080808080808080, 0x8080808080808080};
+ __m256i const_112 = __lasx_xvldi(0x470);
+ __m256i const_74 = __lasx_xvldi(0x44A);
+ __m256i const_38 = __lasx_xvldi(0x426);
+ __m256i const_94 = __lasx_xvldi(0x45E);
+ __m256i const_18 = __lasx_xvldi(0x412);
+ __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
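The same constant swap repeats in every *ToUVRow function below: the vldi immediates move from the halved coefficients (0x438 = 56, 0x425 = 37, ...) to the full ones (0x470 = 112, 0x44A = 74, ...), and the bias vector moves from 0x8080 to 0x8000 per 16-bit lane. The halved values made sense while the subsampled b/g/r were still at twice pixel scale; with the new true rounded average they return to full scale. A small self-check of that pairing, under my reading of vldi(0x4nn) as "broadcast nn into each 16-bit lane" (which matches the explicit {0x0038003800380038, ...} initializers this change deletes):

#include <assert.h>

// Hypothetical check, not libyuv code: the new coefficients are exactly
// twice the old ones, matching the move from a 2x-scaled pair average to a
// true 4-pixel average.
int main(void) {
  const int old_coeff[] = {0x38, 0x25, 0x13, 0x2F, 0x09};  // 56 37 19 47 9
  const int new_coeff[] = {0x70, 0x4A, 0x26, 0x5E, 0x12};  // 112 74 38 94 18
  for (int i = 0; i < 5; ++i) {
    assert(new_coeff[i] == 2 * old_coeff[i]);
  }
  return 0;
}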
@@ -1723,13 +1734,13 @@ void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
  __m256i tmp0, tmp1, tmp2, tmp3;
  __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m256i reg0, reg1, reg2, reg3, dst0;
- __m256i const_112 = __lasx_xvldi(0x438);
- __m256i const_74 = __lasx_xvldi(0x425);
- __m256i const_38 = __lasx_xvldi(0x413);
- __m256i const_94 = __lasx_xvldi(0x42F);
- __m256i const_18 = __lasx_xvldi(0x409);
- __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
- 0x8080808080808080, 0x8080808080808080};
+ __m256i const_112 = __lasx_xvldi(0x470);
+ __m256i const_74 = __lasx_xvldi(0x44A);
+ __m256i const_38 = __lasx_xvldi(0x426);
+ __m256i const_94 = __lasx_xvldi(0x45E);
+ __m256i const_18 = __lasx_xvldi(0x412);
+ __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0,
@@ -1790,13 +1801,13 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
  __m256i src0, src1, src2, reg0, reg1, reg2;
  __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
  __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
- __m256i const_112 = __lasx_xvldi(0x438);
- __m256i const_74 = __lasx_xvldi(0x425);
- __m256i const_38 = __lasx_xvldi(0x413);
- __m256i const_94 = __lasx_xvldi(0x42F);
- __m256i const_18 = __lasx_xvldi(0x409);
- __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
- 0x8080808080808080, 0x8080808080808080};
+ __m256i const_112 = __lasx_xvldi(0x470);
+ __m256i const_74 = __lasx_xvldi(0x44A);
+ __m256i const_38 = __lasx_xvldi(0x426);
+ __m256i const_94 = __lasx_xvldi(0x45E);
+ __m256i const_18 = __lasx_xvldi(0x412);
+ __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};
  __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18,
  0x15120F0C09060300, 0x00000000001E1B18};
  __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908,
@@ -1853,13 +1864,13 @@ void RAWToUVRow_LASX(const uint8_t* src_raw,
  __m256i src0, src1, src2, reg0, reg1, reg2;
  __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
  __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
- __m256i const_112 = __lasx_xvldi(0x438);
- __m256i const_74 = __lasx_xvldi(0x425);
- __m256i const_38 = __lasx_xvldi(0x413);
- __m256i const_94 = __lasx_xvldi(0x42F);
- __m256i const_18 = __lasx_xvldi(0x409);
- __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
- 0x8080808080808080, 0x8080808080808080};
+ __m256i const_112 = __lasx_xvldi(0x470);
+ __m256i const_74 = __lasx_xvldi(0x44A);
+ __m256i const_38 = __lasx_xvldi(0x426);
+ __m256i const_94 = __lasx_xvldi(0x45E);
+ __m256i const_18 = __lasx_xvldi(0x412);
+ __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};
  __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18,
  0x15120F0C09060300, 0x00000000001E1B18};
  __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908,
@@ -2239,16 +2250,16 @@ void ARGBToUVJRow_LASX(const uint8_t* src_argb,
  int len = width / 32;
  __m256i src0, src1, src2, src3;
  __m256i nex0, nex1, nex2, nex3;
- __m256i tmp0, tmp1, tmp2, tmp3;
+ __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  __m256i reg0, reg1, dst0;
  __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m256i const_63 = __lasx_xvldi(0x43F);
  __m256i const_42 = __lasx_xvldi(0x42A);
  __m256i const_128 = __lasx_xvldi(0x480);
  __m256i const_85 = __lasx_xvldi(0x455);
  __m256i const_43 = __lasx_xvldi(0x42B);
  __m256i const_107 = __lasx_xvldi(0x46B);
  __m256i const_21 = __lasx_xvldi(0x415);
  __m256i const_53 = __lasx_xvldi(0x435);
  __m256i const_10 = __lasx_xvldi(0x40A);
- __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
- 0x8080808080808080, 0x8080808080808080};
+ __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+ 0x8000800080008000, 0x8000800080008000};
  __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
  0x1F1D0F0D1B190B09};
@@ -2277,15 +2288,27 @@ void ARGBToUVJRow_LASX(const uint8_t* src_argb,
  tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg);
  reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr);
  reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr);
- tmpb = __lasx_xvavgr_hu(tmp0, tmp1);
- tmpg = __lasx_xvavgr_hu(tmp2, tmp3);
- tmpr = __lasx_xvavgr_hu(reg0, reg1);
- reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb);
- reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr);
- reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg);
- reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg);
- reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr);
- reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb);
+ tmp4 = __lasx_xvaddwev_w_hu(tmp0, tmp1);
+ tmp5 = __lasx_xvaddwod_w_hu(tmp0, tmp1);
+ tmp0 = __lasx_xvilvl_w(tmp5, tmp4);
+ tmp1 = __lasx_xvilvh_w(tmp5, tmp4);
+ tmpb = __lasx_xvssrarni_hu_w(tmp1, tmp0, 2);
+ tmp4 = __lasx_xvaddwev_w_hu(tmp2, tmp3);
+ tmp5 = __lasx_xvaddwod_w_hu(tmp2, tmp3);
+ tmp2 = __lasx_xvilvl_w(tmp5, tmp4);
+ tmp3 = __lasx_xvilvh_w(tmp5, tmp4);
+ tmpg = __lasx_xvssrarni_hu_w(tmp3, tmp2, 2);
+ tmp4 = __lasx_xvaddwev_w_hu(reg0, reg1);
+ tmp5 = __lasx_xvaddwod_w_hu(reg0, reg1);
+ tmp0 = __lasx_xvilvl_w(tmp5, tmp4);
+ tmp1 = __lasx_xvilvh_w(tmp5, tmp4);
+ tmpr = __lasx_xvssrarni_hu_w(tmp1, tmp0, 2);
+ reg0 = __lasx_xvmadd_h(const_8000, const_128, tmpb);
+ reg1 = __lasx_xvmadd_h(const_8000, const_128, tmpr);
+ reg0 = __lasx_xvmsub_h(reg0, const_85, tmpg);
+ reg1 = __lasx_xvmsub_h(reg1, const_107, tmpg);
+ reg0 = __lasx_xvmsub_h(reg0, const_43, tmpr);
+ reg1 = __lasx_xvmsub_h(reg1, const_21, tmpb);
  dst0 = __lasx_xvpackod_b(reg1, reg0);
  tmp0 = __lasx_xvpermi_d(dst0, 0x44);
  tmp1 = __lasx_xvpermi_d(dst0, 0xEE);
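This is the last of the 256-bit LASX hunks; the hunks that follow apply the same changes to the 128-bit LSX code. For the full-swing (JPEG-style) ARGBToUVJ row above, the rewritten tail uses full-range coefficients instead of the old halved ones. A scalar sketch of the result per output sample, with b/g/r again the rounded 2x2 averages and the final >> 8 taken from the odd-byte selection that xvpackod_b / vpickod_b performs here (my interpretation of the shown intrinsics, not a verbatim extract):

#include <stdint.h>

// Scalar sketch of the full-swing (J) chroma mapping in the hunk above.
// 128/85/43 and 128/107/21 are the coefficients the diff introduces;
// taking the high byte of each 16-bit result models the odd-byte pack.
static uint8_t to_uj(int b, int g, int r) {
  return (uint8_t)((0x8000 + 128 * b - 85 * g - 43 * r) >> 8);
}
static uint8_t to_vj(int b, int g, int r) {
  return (uint8_t)((0x8000 + 128 * r - 107 * g - 21 * b) >> 8);
}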
@@ -239,7 +239,7 @@ extern "C" {

  #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
  { \
- __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
+ __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
  __m128i _reg0, _reg1; \
  _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \
  _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \
@@ -247,11 +247,23 @@ extern "C" {
  _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \
  _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \
  _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \
- _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \
- _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \
- _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \
- _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \
- _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \
+ _tmp4 = __lsx_vaddwev_w_hu(_tmp0, _tmp1); \
+ _tmp5 = __lsx_vaddwod_w_hu(_tmp0, _tmp1); \
+ _tmp0 = __lsx_vilvl_w(_tmp5, _tmp4); \
+ _tmp1 = __lsx_vilvh_w(_tmp5, _tmp4); \
+ _tmpb = __lsx_vssrarni_hu_w(_tmp1, _tmp0, 2); \
+ _tmp4 = __lsx_vaddwev_w_hu(_tmp2, _tmp3); \
+ _tmp5 = __lsx_vaddwod_w_hu(_tmp2, _tmp3); \
+ _tmp2 = __lsx_vilvl_w(_tmp5, _tmp4); \
+ _tmp3 = __lsx_vilvh_w(_tmp5, _tmp4); \
+ _tmpg = __lsx_vssrarni_hu_w(_tmp3, _tmp2, 2); \
+ _tmp4 = __lsx_vaddwev_w_hu(_reg0, _reg1); \
+ _tmp5 = __lsx_vaddwod_w_hu(_reg0, _reg1); \
+ _tmp0 = __lsx_vilvl_w(_tmp5, _tmp4); \
+ _tmp1 = __lsx_vilvh_w(_tmp5, _tmp4); \
+ _tmpr = __lsx_vssrarni_hu_w(_tmp1, _tmp0, 2); \
+ _reg0 = __lsx_vmadd_h(const_8000, const_112, _tmpb); \
+ _reg1 = __lsx_vmadd_h(const_8000, const_112, _tmpr); \
  _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \
  _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \
  _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \
@@ -787,12 +799,12 @@ void ARGBToUVRow_LSX(const uint8_t* src_argb0,
  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
  __m128i vec0, vec1, vec2, vec3;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
- __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038};
- __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025};
- __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013};
- __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f};
- __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009};
- __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_0x70 = __lsx_vldi(0x470);
+ __m128i const_0x4A = __lsx_vldi(0x44A);
+ __m128i const_0x26 = __lsx_vldi(0x426);
+ __m128i const_0x5E = __lsx_vldi(0x45E);
+ __m128i const_0x12 = __lsx_vldi(0x412);
+ __m128i const_0x8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0,
  48, src0, src1, src2, src3);
@@ -814,17 +826,20 @@ void ARGBToUVRow_LSX(const uint8_t* src_argb0,
  tmp5 = __lsx_vpickev_h(vec3, vec2);
  vec0 = __lsx_vpickev_h(tmp1, tmp0);
  vec1 = __lsx_vpickod_h(tmp1, tmp0);
- src0 = __lsx_vavgr_h(vec0, vec1);
+ src0 = __lsx_vadd_h(vec0, vec1);
+ src0 = __lsx_vsrari_h(src0, 2);
  vec0 = __lsx_vpickev_h(tmp3, tmp2);
  vec1 = __lsx_vpickod_h(tmp3, tmp2);
- src1 = __lsx_vavgr_h(vec0, vec1);
+ src1 = __lsx_vadd_h(vec0, vec1);
+ src1 = __lsx_vsrari_h(src1, 2);
  vec0 = __lsx_vpickev_h(tmp5, tmp4);
  vec1 = __lsx_vpickod_h(tmp5, tmp4);
- src2 = __lsx_vavgr_h(vec0, vec1);
- dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70);
+ src2 = __lsx_vadd_h(vec0, vec1);
+ src2 = __lsx_vsrari_h(src2, 2);
+ dst0 = __lsx_vmadd_h(const_0x8000, src0, const_0x70);
  dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A);
  dst0 = __lsx_vmsub_h(dst0, src1, const_0x26);
- dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70);
+ dst1 = __lsx_vmadd_h(const_0x8000, src1, const_0x70);
  dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E);
  dst1 = __lsx_vmsub_h(dst1, src0, const_0x12);
  dst0 = __lsx_vsrai_h(dst0, 8);
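Putting the two pieces together for the LSX variant of ARGBToUVRow just shown: one U output per 2x2 block of pixels can be modeled end to end as the rounded average followed by the biased dot product. A hedged composition of the sketches given earlier (illustrative only; the real row function processes many pixels per iteration with the intrinsics above):

#include <stdint.h>

// End-to-end scalar model for one U sample from a 2x2 block, composing the
// 4-pixel rounding average with the fixed-point U mapping shown above.
static uint8_t u_for_block(const uint8_t b[4], const uint8_t g[4],
                           const uint8_t r[4]) {
  int ab = (b[0] + b[1] + b[2] + b[3] + 2) >> 2;
  int ag = (g[0] + g[1] + g[2] + g[3] + 2) >> 2;
  int ar = (r[0] + r[1] + r[2] + r[3] + 2) >> 2;
  return (uint8_t)((0x8000 + 112 * ab - 74 * ag - 38 * ar) >> 8);
}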
@@ -991,7 +1006,7 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb,
  __m128i const_38 = __lsx_vldi(38);
  __m128i const_94 = __lsx_vldi(94);
  __m128i const_18 = __lsx_vldi(18);
- __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_0x8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
  src0, src1, src2, src3);
@@ -999,8 +1014,8 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb,
  tmp1 = __lsx_vpickod_h(src1, src0);
  tmp2 = __lsx_vpickev_h(src3, src2);
  tmp3 = __lsx_vpickod_h(src3, src2);
- reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112);
- reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112);
+ reg0 = __lsx_vmaddwev_h_bu(const_0x8000, tmp0, const_112);
+ reg1 = __lsx_vmaddwev_h_bu(const_0x8000, tmp2, const_112);
  reg2 = __lsx_vmulwod_h_bu(tmp0, const_74);
  reg3 = __lsx_vmulwod_h_bu(tmp2, const_74);
  reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38);
@@ -1011,8 +1026,8 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb,
  reg1 = __lsx_vsrai_h(reg1, 8);
  dst0 = __lsx_vpickev_b(reg1, reg0);

- reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112);
- reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112);
+ reg0 = __lsx_vmaddwev_h_bu(const_0x8000, tmp1, const_112);
+ reg1 = __lsx_vmaddwev_h_bu(const_0x8000, tmp3, const_112);
  reg2 = __lsx_vmulwev_h_bu(tmp0, const_18);
  reg3 = __lsx_vmulwev_h_bu(tmp2, const_18);
  reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94);
@@ -1530,12 +1545,12 @@ void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i reg0, reg1, reg2, reg3, dst0;
- __m128i const_112 = __lsx_vldi(0x438);
- __m128i const_74 = __lsx_vldi(0x425);
- __m128i const_38 = __lsx_vldi(0x413);
- __m128i const_94 = __lsx_vldi(0x42F);
- __m128i const_18 = __lsx_vldi(0x409);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_112 = __lsx_vldi(0x470);
+ __m128i const_74 = __lsx_vldi(0x44A);
+ __m128i const_38 = __lsx_vldi(0x426);
+ __m128i const_94 = __lsx_vldi(0x45E);
+ __m128i const_18 = __lsx_vldi(0x412);
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
@@ -1639,12 +1654,12 @@ void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i reg0, reg1, reg2, reg3, dst0;
- __m128i const_112 = __lsx_vldi(0x438);
- __m128i const_74 = __lsx_vldi(0x425);
- __m128i const_38 = __lsx_vldi(0x413);
- __m128i const_94 = __lsx_vldi(0x42F);
- __m128i const_18 = __lsx_vldi(0x409);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_112 = __lsx_vldi(0x470);
+ __m128i const_74 = __lsx_vldi(0x44A);
+ __m128i const_38 = __lsx_vldi(0x426);
+ __m128i const_94 = __lsx_vldi(0x45E);
+ __m128i const_18 = __lsx_vldi(0x412);
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
@@ -1700,12 +1715,12 @@ void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
  __m128i src0, src1, src2;
  __m128i nex0, nex1, nex2, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
- __m128i const_112 = __lsx_vldi(0x438);
- __m128i const_74 = __lsx_vldi(0x425);
- __m128i const_38 = __lsx_vldi(0x413);
- __m128i const_94 = __lsx_vldi(0x42F);
- __m128i const_18 = __lsx_vldi(0x409);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_112 = __lsx_vldi(0x470);
+ __m128i const_74 = __lsx_vldi(0x44A);
+ __m128i const_38 = __lsx_vldi(0x426);
+ __m128i const_94 = __lsx_vldi(0x45E);
+ __m128i const_18 = __lsx_vldi(0x412);
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
  __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
  __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
  __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
@@ -1753,12 +1768,12 @@ void RAWToUVRow_LSX(const uint8_t* src_raw,
  __m128i src0, src1, src2;
  __m128i nex0, nex1, nex2, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
- __m128i const_112 = __lsx_vldi(0x438);
- __m128i const_74 = __lsx_vldi(0x425);
- __m128i const_38 = __lsx_vldi(0x413);
- __m128i const_94 = __lsx_vldi(0x42F);
- __m128i const_18 = __lsx_vldi(0x409);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_112 = __lsx_vldi(0x470);
+ __m128i const_74 = __lsx_vldi(0x44A);
+ __m128i const_38 = __lsx_vldi(0x426);
+ __m128i const_94 = __lsx_vldi(0x45E);
+ __m128i const_18 = __lsx_vldi(0x412);
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
  __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
  __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
  __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
@@ -1988,12 +2003,12 @@ void BGRAToUVRow_LSX(const uint8_t* src_bgra,
  __m128i nex0, nex1, nex2, nex3;
  __m128i tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
- __m128i const_112 = __lsx_vldi(0x438);
- __m128i const_74 = __lsx_vldi(0x425);
- __m128i const_38 = __lsx_vldi(0x413);
- __m128i const_94 = __lsx_vldi(0x42F);
- __m128i const_18 = __lsx_vldi(0x409);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_112 = __lsx_vldi(0x470);
+ __m128i const_74 = __lsx_vldi(0x44A);
+ __m128i const_38 = __lsx_vldi(0x426);
+ __m128i const_94 = __lsx_vldi(0x45E);
+ __m128i const_18 = __lsx_vldi(0x412);
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
@@ -2036,12 +2051,12 @@ void ABGRToUVRow_LSX(const uint8_t* src_abgr,
  __m128i nex0, nex1, nex2, nex3;
  __m128i tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
- __m128i const_112 = __lsx_vldi(0x438);
- __m128i const_74 = __lsx_vldi(0x425);
- __m128i const_38 = __lsx_vldi(0x413);
- __m128i const_94 = __lsx_vldi(0x42F);
- __m128i const_18 = __lsx_vldi(0x409);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_112 = __lsx_vldi(0x470);
+ __m128i const_74 = __lsx_vldi(0x44A);
+ __m128i const_38 = __lsx_vldi(0x426);
+ __m128i const_94 = __lsx_vldi(0x45E);
+ __m128i const_18 = __lsx_vldi(0x412);
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
@@ -2084,12 +2099,12 @@ void RGBAToUVRow_LSX(const uint8_t* src_rgba,
  __m128i nex0, nex1, nex2, nex3;
  __m128i tmp0, tmp1, tmp2, tmp3, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
- __m128i const_112 = __lsx_vldi(0x438);
- __m128i const_74 = __lsx_vldi(0x425);
- __m128i const_38 = __lsx_vldi(0x413);
- __m128i const_94 = __lsx_vldi(0x42F);
- __m128i const_18 = __lsx_vldi(0x409);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_112 = __lsx_vldi(0x470);
+ __m128i const_74 = __lsx_vldi(0x44A);
+ __m128i const_38 = __lsx_vldi(0x426);
+ __m128i const_94 = __lsx_vldi(0x45E);
+ __m128i const_18 = __lsx_vldi(0x412);
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
@@ -2130,15 +2145,15 @@ void ARGBToUVJRow_LSX(const uint8_t* src_argb,
  int len = width / 16;
  __m128i src0, src1, src2, src3;
  __m128i nex0, nex1, nex2, nex3;
- __m128i tmp0, tmp1, tmp2, tmp3;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
  __m128i reg0, reg1, dst0;
  __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
  __m128i const_63 = __lsx_vldi(0x43F);
  __m128i const_42 = __lsx_vldi(0x42A);
  __m128i const_128 = __lsx_vldi(0x480);
  __m128i const_85 = __lsx_vldi(0x455);
  __m128i const_43 = __lsx_vldi(0x42B);
  __m128i const_107 = __lsx_vldi(0x46B);
  __m128i const_21 = __lsx_vldi(0x415);
  __m128i const_53 = __lsx_vldi(0x435);
  __m128i const_10 = __lsx_vldi(0x40A);
- __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+ __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};

  for (x = 0; x < len; x++) {
  DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
@@ -2165,15 +2180,27 @@ void ARGBToUVJRow_LSX(const uint8_t* src_argb,
  tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
  reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
  reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
- tmpb = __lsx_vavgr_hu(tmp0, tmp1);
- tmpg = __lsx_vavgr_hu(tmp2, tmp3);
- tmpr = __lsx_vavgr_hu(reg0, reg1);
- reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
- reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
- reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
- reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
- reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
- reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
+ tmp4 = __lsx_vaddwev_w_hu(tmp0, tmp1);
+ tmp5 = __lsx_vaddwod_w_hu(tmp0, tmp1);
+ tmp0 = __lsx_vilvl_w(tmp5, tmp4);
+ tmp1 = __lsx_vilvh_w(tmp5, tmp4);
+ tmpb = __lsx_vssrarni_hu_w(tmp1, tmp0, 2);
+ tmp4 = __lsx_vaddwev_w_hu(tmp2, tmp3);
+ tmp5 = __lsx_vaddwod_w_hu(tmp2, tmp3);
+ tmp2 = __lsx_vilvl_w(tmp5, tmp4);
+ tmp3 = __lsx_vilvh_w(tmp5, tmp4);
+ tmpg = __lsx_vssrarni_hu_w(tmp3, tmp2, 2);
+ tmp4 = __lsx_vaddwev_w_hu(reg0, reg1);
+ tmp5 = __lsx_vaddwod_w_hu(reg0, reg1);
+ tmp0 = __lsx_vilvl_w(tmp5, tmp4);
+ tmp1 = __lsx_vilvh_w(tmp5, tmp4);
+ tmpr = __lsx_vssrarni_hu_w(tmp1, tmp0, 2);
+ reg0 = __lsx_vmadd_h(const_8000, const_128, tmpb);
+ reg1 = __lsx_vmadd_h(const_8000, const_128, tmpr);
+ reg0 = __lsx_vmsub_h(reg0, const_85, tmpg);
+ reg1 = __lsx_vmsub_h(reg1, const_107, tmpg);
+ reg0 = __lsx_vmsub_h(reg0, const_43, tmpr);
+ reg1 = __lsx_vmsub_h(reg1, const_21, tmpb);
  dst0 = __lsx_vpickod_b(reg1, reg0);
  __lsx_vstelm_d(dst0, dst_u, 0, 0);
  __lsx_vstelm_d(dst0, dst_v, 0, 1);