loong64: UV subsample's 4-pixel rounding average and ARGBToJ444 fixed-point scaling

The UV subsample 4-pixel rounding average and the ARGBToJ444 fixed-point scaling
were updated in d32d19cc and c060118b. This change updates the LoongArch (LSX/LASX)
optimizations to match.

Bug: 381138208
Change-Id: I3585d72564e4fffe514599b1a9b4fee8fbbd0266
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6878364
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Authored by yuanhecai on 2025-08-19 17:11:15 +08:00; committed by Frank Barchard
parent 70458840c4
commit eb4e4736a4
2 changed files with 202 additions and 152 deletions
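
For context, a minimal scalar sketch of the arithmetic the updated intrinsics implement (an illustration only, not libyuv's reference code; the helper names are invented). The 2x2 chroma subsample is now a true rounded 4-pixel average instead of a rounded half-sum, so the full coefficients and a 0x8000 bias are used before the final arithmetic shift right by 8:

#include <stdint.h>

// Hypothetical helpers for illustration; not part of libyuv.
static uint16_t subsample_avg4(uint8_t p00, uint8_t p01, uint8_t p10, uint8_t p11) {
  // New behaviour: rounded average of a 2x2 block, as produced by
  // __lasx_xvssrarni_hu_w(..., 2) / __lsx_vssrarni_hu_w(..., 2).
  // The old code kept (p00 + p01 + p10 + p11 + 1) >> 1, roughly twice the
  // average, and compensated with halved coefficients and a 0x8080 bias.
  return (uint16_t)((p00 + p01 + p10 + p11 + 2) >> 2);
}

static uint8_t u_from_averaged_bgr(uint16_t b, uint16_t g, uint16_t r) {
  // Fixed-point U mirroring the dst0 computation in ARGBToUVRow (0x8000 bias,
  // 112/74/38 coefficients, then >> 8), assuming the usual
  // U = 112*B - 74*G - 38*R ordering.
  return (uint8_t)((0x8000 + 112 * b - 74 * g - 38 * r) >> 8);
}

For a grey 2x2 block (b == g == r) this yields U = 128, the unbiased chroma value.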

row_lasx.cc

@@ -193,18 +193,30 @@ extern "C" {
 #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _reg0, _reg1) \
   { \
-    __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
+    __m256i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
     _tmp0 = __lasx_xvaddwev_h_bu(_tmpb, _nexb); \
     _tmp1 = __lasx_xvaddwod_h_bu(_tmpb, _nexb); \
     _tmp2 = __lasx_xvaddwev_h_bu(_tmpg, _nexg); \
     _tmp3 = __lasx_xvaddwod_h_bu(_tmpg, _nexg); \
     _reg0 = __lasx_xvaddwev_h_bu(_tmpr, _nexr); \
     _reg1 = __lasx_xvaddwod_h_bu(_tmpr, _nexr); \
-    _tmpb = __lasx_xvavgr_hu(_tmp0, _tmp1); \
-    _tmpg = __lasx_xvavgr_hu(_tmp2, _tmp3); \
-    _tmpr = __lasx_xvavgr_hu(_reg0, _reg1); \
-    _reg0 = __lasx_xvmadd_h(const_8080, const_112, _tmpb); \
-    _reg1 = __lasx_xvmadd_h(const_8080, const_112, _tmpr); \
+    _tmp4 = __lasx_xvaddwev_w_hu(_tmp0, _tmp1); \
+    _tmp5 = __lasx_xvaddwod_w_hu(_tmp0, _tmp1); \
+    _tmp0 = __lasx_xvilvl_w(_tmp5, _tmp4); \
+    _tmp1 = __lasx_xvilvh_w(_tmp5, _tmp4); \
+    _tmpb = __lasx_xvssrarni_hu_w(_tmp1, _tmp0, 2); \
+    _tmp4 = __lasx_xvaddwev_w_hu(_tmp2, _tmp3); \
+    _tmp5 = __lasx_xvaddwod_w_hu(_tmp2, _tmp3); \
+    _tmp2 = __lasx_xvilvl_w(_tmp5, _tmp4); \
+    _tmp3 = __lasx_xvilvh_w(_tmp5, _tmp4); \
+    _tmpg = __lasx_xvssrarni_hu_w(_tmp3, _tmp2, 2); \
+    _tmp4 = __lasx_xvaddwev_w_hu(_reg0, _reg1); \
+    _tmp5 = __lasx_xvaddwod_w_hu(_reg0, _reg1); \
+    _tmp0 = __lasx_xvilvl_w(_tmp5, _tmp4); \
+    _tmp1 = __lasx_xvilvh_w(_tmp5, _tmp4); \
+    _tmpr = __lasx_xvssrarni_hu_w(_tmp1, _tmp0, 2); \
+    _reg0 = __lasx_xvmadd_h(const_8000, const_112, _tmpb); \
+    _reg1 = __lasx_xvmadd_h(const_8000, const_112, _tmpr); \
     _reg0 = __lasx_xvmsub_h(_reg0, const_74, _tmpg); \
     _reg1 = __lasx_xvmsub_h(_reg1, const_94, _tmpg); \
     _reg0 = __lasx_xvmsub_h(_reg0, const_38, _tmpr); \
@@ -787,20 +799,16 @@ void ARGBToUVRow_LASX(const uint8_t* src_argb0,
   __m256i src0, src1, src2, src3, src4, src5, src6, src7;
   __m256i vec0, vec1, vec2, vec3;
   __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
-  __m256i const_0x70 = {0x0038003800380038, 0x0038003800380038,
-                        0x0038003800380038, 0x0038003800380038};
-  __m256i const_0x4A = {0x0025002500250025, 0x0025002500250025,
-                        0x0025002500250025, 0x0025002500250025};
-  __m256i const_0x26 = {0x0013001300130013, 0x0013001300130013,
-                        0x0013001300130013, 0x0013001300130013};
-  __m256i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f,
-                        0x002f002f002f002f, 0x002f002f002f002f};
-  __m256i const_0x12 = {0x0009000900090009, 0x0009000900090009,
-                        0x0009000900090009, 0x0009000900090009};
+  __m256i const_0x70 = __lasx_xvldi(0x470);
+  __m256i const_0x4A = __lasx_xvldi(0x44A);
+  __m256i const_0x26 = __lasx_xvldi(0x426);
+  __m256i const_0x5E = __lasx_xvldi(0x45E);
+  __m256i const_0x12 = __lasx_xvldi(0x412);
   __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
                      0x0000000700000003};
-  __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
-                                        0x8080808080808080, 0x8080808080808080};
+  __m256i const_0x8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+                                        0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lasx_xvld, src_argb0, 0, src_argb0, 32, src_argb0, 64,
@@ -823,17 +831,20 @@ void ARGBToUVRow_LASX(const uint8_t* src_argb0,
     tmp5 = __lasx_xvpickev_h(vec3, vec2);
     vec0 = __lasx_xvpickev_h(tmp1, tmp0);
     vec1 = __lasx_xvpickod_h(tmp1, tmp0);
-    src0 = __lasx_xvavgr_h(vec0, vec1);
+    src0 = __lasx_xvadd_h(vec0, vec1);
+    src0 = __lasx_xvsrari_h(src0, 2);
     vec0 = __lasx_xvpickev_h(tmp3, tmp2);
     vec1 = __lasx_xvpickod_h(tmp3, tmp2);
-    src1 = __lasx_xvavgr_h(vec0, vec1);
+    src1 = __lasx_xvadd_h(vec0, vec1);
+    src1 = __lasx_xvsrari_h(src1, 2);
     vec0 = __lasx_xvpickev_h(tmp5, tmp4);
     vec1 = __lasx_xvpickod_h(tmp5, tmp4);
-    src2 = __lasx_xvavgr_h(vec0, vec1);
-    dst0 = __lasx_xvmadd_h(const_0x8080, src0, const_0x70);
+    src2 = __lasx_xvadd_h(vec0, vec1);
+    src2 = __lasx_xvsrari_h(src2, 2);
+    dst0 = __lasx_xvmadd_h(const_0x8000, src0, const_0x70);
     dst0 = __lasx_xvmsub_h(dst0, src2, const_0x4A);
     dst0 = __lasx_xvmsub_h(dst0, src1, const_0x26);
-    dst1 = __lasx_xvmadd_h(const_0x8080, src1, const_0x70);
+    dst1 = __lasx_xvmadd_h(const_0x8000, src1, const_0x70);
     dst1 = __lasx_xvmsub_h(dst1, src2, const_0x5E);
     dst1 = __lasx_xvmsub_h(dst1, src0, const_0x12);
     dst0 = __lasx_xvperm_w(dst0, control);
@@ -1037,8 +1048,8 @@ void ARGBToUV444Row_LASX(const uint8_t* src_argb,
   __m256i const_38 = __lasx_xvldi(38);
   __m256i const_94 = __lasx_xvldi(94);
   __m256i const_18 = __lasx_xvldi(18);
-  __m256i const_0x8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
-                                        0x8080808080808080, 0x8080808080808080};
+  __m256i const_0x8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+                                        0x8000800080008000, 0x8000800080008000};
   __m256i control = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
                      0x0000000700000003};
   for (x = 0; x < len; x++) {
@@ -1048,8 +1059,8 @@ void ARGBToUV444Row_LASX(const uint8_t* src_argb,
     tmp1 = __lasx_xvpickod_h(src1, src0);
     tmp2 = __lasx_xvpickev_h(src3, src2);
     tmp3 = __lasx_xvpickod_h(src3, src2);
-    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp0, const_112);
-    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp2, const_112);
+    reg0 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp0, const_112);
+    reg1 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp2, const_112);
     reg2 = __lasx_xvmulwod_h_bu(tmp0, const_74);
     reg3 = __lasx_xvmulwod_h_bu(tmp2, const_74);
     reg2 = __lasx_xvmaddwev_h_bu(reg2, tmp1, const_38);
@@ -1058,8 +1069,8 @@ void ARGBToUV444Row_LASX(const uint8_t* src_argb,
     reg1 = __lasx_xvsub_h(reg1, reg3);
     dst0 = __lasx_xvssrani_b_h(reg1, reg0, 8);
     dst0 = __lasx_xvperm_w(dst0, control);
-    reg0 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp1, const_112);
-    reg1 = __lasx_xvmaddwev_h_bu(const_0x8080, tmp3, const_112);
+    reg0 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp1, const_112);
+    reg1 = __lasx_xvmaddwev_h_bu(const_0x8000, tmp3, const_112);
     reg2 = __lasx_xvmulwev_h_bu(tmp0, const_18);
     reg3 = __lasx_xvmulwev_h_bu(tmp2, const_18);
     reg2 = __lasx_xvmaddwod_h_bu(reg2, tmp0, const_94);
@@ -1606,13 +1617,13 @@ void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
   __m256i tmp0, tmp1, tmp2, tmp3;
   __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
   __m256i reg0, reg1, reg2, reg3, dst0;
-  __m256i const_112 = __lasx_xvldi(0x438);
-  __m256i const_74 = __lasx_xvldi(0x425);
-  __m256i const_38 = __lasx_xvldi(0x413);
-  __m256i const_94 = __lasx_xvldi(0x42F);
-  __m256i const_18 = __lasx_xvldi(0x409);
-  __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
-                                      0x8080808080808080, 0x8080808080808080};
+  __m256i const_112 = __lasx_xvldi(0x470);
+  __m256i const_74 = __lasx_xvldi(0x44A);
+  __m256i const_38 = __lasx_xvldi(0x426);
+  __m256i const_94 = __lasx_xvldi(0x45E);
+  __m256i const_18 = __lasx_xvldi(0x412);
+  __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+                                      0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
@@ -1723,13 +1734,13 @@ void RGB565ToUVRow_LASX(const uint8_t* src_rgb565,
   __m256i tmp0, tmp1, tmp2, tmp3;
   __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
   __m256i reg0, reg1, reg2, reg3, dst0;
-  __m256i const_112 = __lasx_xvldi(0x438);
-  __m256i const_74 = __lasx_xvldi(0x425);
-  __m256i const_38 = __lasx_xvldi(0x413);
-  __m256i const_94 = __lasx_xvldi(0x42F);
-  __m256i const_18 = __lasx_xvldi(0x409);
-  __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
-                                      0x8080808080808080, 0x8080808080808080};
+  __m256i const_112 = __lasx_xvldi(0x470);
+  __m256i const_74 = __lasx_xvldi(0x44A);
+  __m256i const_38 = __lasx_xvldi(0x426);
+  __m256i const_94 = __lasx_xvldi(0x45E);
+  __m256i const_18 = __lasx_xvldi(0x412);
+  __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+                                      0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lasx_xvld, src_rgb565, 0, src_rgb565, 32, next_rgb565, 0,
@@ -1790,13 +1801,13 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
   __m256i src0, src1, src2, reg0, reg1, reg2;
   __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
   __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m256i const_112 = __lasx_xvldi(0x438);
-  __m256i const_74 = __lasx_xvldi(0x425);
-  __m256i const_38 = __lasx_xvldi(0x413);
-  __m256i const_94 = __lasx_xvldi(0x42F);
-  __m256i const_18 = __lasx_xvldi(0x409);
-  __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
-                                      0x8080808080808080, 0x8080808080808080};
+  __m256i const_112 = __lasx_xvldi(0x470);
+  __m256i const_74 = __lasx_xvldi(0x44A);
+  __m256i const_38 = __lasx_xvldi(0x426);
+  __m256i const_94 = __lasx_xvldi(0x45E);
+  __m256i const_18 = __lasx_xvldi(0x412);
+  __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+                                      0x8000800080008000, 0x8000800080008000};
   __m256i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18,
                       0x15120F0C09060300, 0x00000000001E1B18};
   __m256i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908,
@@ -1853,13 +1864,13 @@ void RAWToUVRow_LASX(const uint8_t* src_raw,
   __m256i src0, src1, src2, reg0, reg1, reg2;
   __m256i nex0, nex1, nex2, dst0, tmp0, tmp1, tmp2;
   __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m256i const_112 = __lasx_xvldi(0x438);
-  __m256i const_74 = __lasx_xvldi(0x425);
-  __m256i const_38 = __lasx_xvldi(0x413);
-  __m256i const_94 = __lasx_xvldi(0x42F);
-  __m256i const_18 = __lasx_xvldi(0x409);
-  __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
-                                      0x8080808080808080, 0x8080808080808080};
+  __m256i const_112 = __lasx_xvldi(0x470);
+  __m256i const_74 = __lasx_xvldi(0x44A);
+  __m256i const_38 = __lasx_xvldi(0x426);
+  __m256i const_94 = __lasx_xvldi(0x45E);
+  __m256i const_18 = __lasx_xvldi(0x412);
+  __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+                                      0x8000800080008000, 0x8000800080008000};
   __m256i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18,
                       0x15120F0C09060300, 0x00000000001E1B18};
   __m256i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908,
@@ -2239,16 +2250,16 @@ void ARGBToUVJRow_LASX(const uint8_t* src_argb,
   int len = width / 32;
   __m256i src0, src1, src2, src3;
   __m256i nex0, nex1, nex2, nex3;
-  __m256i tmp0, tmp1, tmp2, tmp3;
+  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
   __m256i reg0, reg1, dst0;
   __m256i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m256i const_63 = __lasx_xvldi(0x43F);
-  __m256i const_42 = __lasx_xvldi(0x42A);
+  __m256i const_128 = __lasx_xvldi(0x480);
+  __m256i const_85 = __lasx_xvldi(0x455);
+  __m256i const_43 = __lasx_xvldi(0x42B);
+  __m256i const_107 = __lasx_xvldi(0x46B);
   __m256i const_21 = __lasx_xvldi(0x415);
-  __m256i const_53 = __lasx_xvldi(0x435);
-  __m256i const_10 = __lasx_xvldi(0x40A);
-  __m256i const_8080 = (__m256i)v4u64{0x8080808080808080, 0x8080808080808080,
-                                      0x8080808080808080, 0x8080808080808080};
+  __m256i const_8000 = (__m256i)v4u64{0x8000800080008000, 0x8000800080008000,
+                                      0x8000800080008000, 0x8000800080008000};
   __m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
                    0x1F1D0F0D1B190B09};
@@ -2277,15 +2288,27 @@ void ARGBToUVJRow_LASX(const uint8_t* src_argb,
     tmp3 = __lasx_xvaddwod_h_bu(tmpg, nexg);
     reg0 = __lasx_xvaddwev_h_bu(tmpr, nexr);
     reg1 = __lasx_xvaddwod_h_bu(tmpr, nexr);
-    tmpb = __lasx_xvavgr_hu(tmp0, tmp1);
-    tmpg = __lasx_xvavgr_hu(tmp2, tmp3);
-    tmpr = __lasx_xvavgr_hu(reg0, reg1);
-    reg0 = __lasx_xvmadd_h(const_8080, const_63, tmpb);
-    reg1 = __lasx_xvmadd_h(const_8080, const_63, tmpr);
-    reg0 = __lasx_xvmsub_h(reg0, const_42, tmpg);
-    reg1 = __lasx_xvmsub_h(reg1, const_53, tmpg);
-    reg0 = __lasx_xvmsub_h(reg0, const_21, tmpr);
-    reg1 = __lasx_xvmsub_h(reg1, const_10, tmpb);
+    tmp4 = __lasx_xvaddwev_w_hu(tmp0, tmp1);
+    tmp5 = __lasx_xvaddwod_w_hu(tmp0, tmp1);
+    tmp0 = __lasx_xvilvl_w(tmp5, tmp4);
+    tmp1 = __lasx_xvilvh_w(tmp5, tmp4);
+    tmpb = __lasx_xvssrarni_hu_w(tmp1, tmp0, 2);
+    tmp4 = __lasx_xvaddwev_w_hu(tmp2, tmp3);
+    tmp5 = __lasx_xvaddwod_w_hu(tmp2, tmp3);
+    tmp2 = __lasx_xvilvl_w(tmp5, tmp4);
+    tmp3 = __lasx_xvilvh_w(tmp5, tmp4);
+    tmpg = __lasx_xvssrarni_hu_w(tmp3, tmp2, 2);
+    tmp4 = __lasx_xvaddwev_w_hu(reg0, reg1);
+    tmp5 = __lasx_xvaddwod_w_hu(reg0, reg1);
+    tmp0 = __lasx_xvilvl_w(tmp5, tmp4);
+    tmp1 = __lasx_xvilvh_w(tmp5, tmp4);
+    tmpr = __lasx_xvssrarni_hu_w(tmp1, tmp0, 2);
+    reg0 = __lasx_xvmadd_h(const_8000, const_128, tmpb);
+    reg1 = __lasx_xvmadd_h(const_8000, const_128, tmpr);
+    reg0 = __lasx_xvmsub_h(reg0, const_85, tmpg);
+    reg1 = __lasx_xvmsub_h(reg1, const_107, tmpg);
+    reg0 = __lasx_xvmsub_h(reg0, const_43, tmpr);
+    reg1 = __lasx_xvmsub_h(reg1, const_21, tmpb);
     dst0 = __lasx_xvpackod_b(reg1, reg0);
     tmp0 = __lasx_xvpermi_d(dst0, 0x44);
     tmp1 = __lasx_xvpermi_d(dst0, 0xEE);

row_lsx.cc

@@ -239,7 +239,7 @@ extern "C" {
 #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
   { \
-    __m128i _tmp0, _tmp1, _tmp2, _tmp3; \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
     __m128i _reg0, _reg1; \
     _tmp0 = __lsx_vaddwev_h_bu(_tmpb, _nexb); \
     _tmp1 = __lsx_vaddwod_h_bu(_tmpb, _nexb); \
@@ -247,11 +247,23 @@ extern "C" {
     _tmp3 = __lsx_vaddwod_h_bu(_tmpg, _nexg); \
     _reg0 = __lsx_vaddwev_h_bu(_tmpr, _nexr); \
     _reg1 = __lsx_vaddwod_h_bu(_tmpr, _nexr); \
-    _tmpb = __lsx_vavgr_hu(_tmp0, _tmp1); \
-    _tmpg = __lsx_vavgr_hu(_tmp2, _tmp3); \
-    _tmpr = __lsx_vavgr_hu(_reg0, _reg1); \
-    _reg0 = __lsx_vmadd_h(const_8080, const_112, _tmpb); \
-    _reg1 = __lsx_vmadd_h(const_8080, const_112, _tmpr); \
+    _tmp4 = __lsx_vaddwev_w_hu(_tmp0, _tmp1); \
+    _tmp5 = __lsx_vaddwod_w_hu(_tmp0, _tmp1); \
+    _tmp0 = __lsx_vilvl_w(_tmp5, _tmp4); \
+    _tmp1 = __lsx_vilvh_w(_tmp5, _tmp4); \
+    _tmpb = __lsx_vssrarni_hu_w(_tmp1, _tmp0, 2); \
+    _tmp4 = __lsx_vaddwev_w_hu(_tmp2, _tmp3); \
+    _tmp5 = __lsx_vaddwod_w_hu(_tmp2, _tmp3); \
+    _tmp2 = __lsx_vilvl_w(_tmp5, _tmp4); \
+    _tmp3 = __lsx_vilvh_w(_tmp5, _tmp4); \
+    _tmpg = __lsx_vssrarni_hu_w(_tmp3, _tmp2, 2); \
+    _tmp4 = __lsx_vaddwev_w_hu(_reg0, _reg1); \
+    _tmp5 = __lsx_vaddwod_w_hu(_reg0, _reg1); \
+    _tmp0 = __lsx_vilvl_w(_tmp5, _tmp4); \
+    _tmp1 = __lsx_vilvh_w(_tmp5, _tmp4); \
+    _tmpr = __lsx_vssrarni_hu_w(_tmp1, _tmp0, 2); \
+    _reg0 = __lsx_vmadd_h(const_8000, const_112, _tmpb); \
+    _reg1 = __lsx_vmadd_h(const_8000, const_112, _tmpr); \
     _reg0 = __lsx_vmsub_h(_reg0, const_74, _tmpg); \
     _reg1 = __lsx_vmsub_h(_reg1, const_94, _tmpg); \
     _reg0 = __lsx_vmsub_h(_reg0, const_38, _tmpr); \
@@ -787,12 +799,12 @@ void ARGBToUVRow_LSX(const uint8_t* src_argb0,
   __m128i src0, src1, src2, src3, src4, src5, src6, src7;
   __m128i vec0, vec1, vec2, vec3;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, dst0, dst1;
-  __m128i const_0x70 = {0x0038003800380038, 0x0038003800380038};
-  __m128i const_0x4A = {0x0025002500250025, 0x0025002500250025};
-  __m128i const_0x26 = {0x0013001300130013, 0x0013001300130013};
-  __m128i const_0x5E = {0x002f002f002f002f, 0x002f002f002f002f};
-  __m128i const_0x12 = {0x0009000900090009, 0x0009000900090009};
-  __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_0x70 = __lsx_vldi(0x470);
+  __m128i const_0x4A = __lsx_vldi(0x44A);
+  __m128i const_0x26 = __lsx_vldi(0x426);
+  __m128i const_0x5E = __lsx_vldi(0x45E);
+  __m128i const_0x12 = __lsx_vldi(0x412);
+  __m128i const_0x8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_argb0, 0, src_argb0, 16, src_argb0, 32, src_argb0,
               48, src0, src1, src2, src3);
@@ -814,17 +826,20 @@ void ARGBToUVRow_LSX(const uint8_t* src_argb0,
     tmp5 = __lsx_vpickev_h(vec3, vec2);
     vec0 = __lsx_vpickev_h(tmp1, tmp0);
     vec1 = __lsx_vpickod_h(tmp1, tmp0);
-    src0 = __lsx_vavgr_h(vec0, vec1);
+    src0 = __lsx_vadd_h(vec0, vec1);
+    src0 = __lsx_vsrari_h(src0, 2);
     vec0 = __lsx_vpickev_h(tmp3, tmp2);
     vec1 = __lsx_vpickod_h(tmp3, tmp2);
-    src1 = __lsx_vavgr_h(vec0, vec1);
+    src1 = __lsx_vadd_h(vec0, vec1);
+    src1 = __lsx_vsrari_h(src1, 2);
     vec0 = __lsx_vpickev_h(tmp5, tmp4);
     vec1 = __lsx_vpickod_h(tmp5, tmp4);
-    src2 = __lsx_vavgr_h(vec0, vec1);
-    dst0 = __lsx_vmadd_h(const_0x8080, src0, const_0x70);
+    src2 = __lsx_vadd_h(vec0, vec1);
+    src2 = __lsx_vsrari_h(src2, 2);
+    dst0 = __lsx_vmadd_h(const_0x8000, src0, const_0x70);
     dst0 = __lsx_vmsub_h(dst0, src2, const_0x4A);
     dst0 = __lsx_vmsub_h(dst0, src1, const_0x26);
-    dst1 = __lsx_vmadd_h(const_0x8080, src1, const_0x70);
+    dst1 = __lsx_vmadd_h(const_0x8000, src1, const_0x70);
     dst1 = __lsx_vmsub_h(dst1, src2, const_0x5E);
     dst1 = __lsx_vmsub_h(dst1, src0, const_0x12);
     dst0 = __lsx_vsrai_h(dst0, 8);
@@ -991,7 +1006,7 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb,
   __m128i const_38 = __lsx_vldi(38);
   __m128i const_94 = __lsx_vldi(94);
   __m128i const_18 = __lsx_vldi(18);
-  __m128i const_0x8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_0x8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
               src0, src1, src2, src3);
@@ -999,8 +1014,8 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb,
     tmp1 = __lsx_vpickod_h(src1, src0);
     tmp2 = __lsx_vpickev_h(src3, src2);
     tmp3 = __lsx_vpickod_h(src3, src2);
-    reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp0, const_112);
-    reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp2, const_112);
+    reg0 = __lsx_vmaddwev_h_bu(const_0x8000, tmp0, const_112);
+    reg1 = __lsx_vmaddwev_h_bu(const_0x8000, tmp2, const_112);
     reg2 = __lsx_vmulwod_h_bu(tmp0, const_74);
     reg3 = __lsx_vmulwod_h_bu(tmp2, const_74);
     reg2 = __lsx_vmaddwev_h_bu(reg2, tmp1, const_38);
@@ -1011,8 +1026,8 @@ void ARGBToUV444Row_LSX(const uint8_t* src_argb,
     reg1 = __lsx_vsrai_h(reg1, 8);
     dst0 = __lsx_vpickev_b(reg1, reg0);
-    reg0 = __lsx_vmaddwev_h_bu(const_0x8080, tmp1, const_112);
-    reg1 = __lsx_vmaddwev_h_bu(const_0x8080, tmp3, const_112);
+    reg0 = __lsx_vmaddwev_h_bu(const_0x8000, tmp1, const_112);
+    reg1 = __lsx_vmaddwev_h_bu(const_0x8000, tmp3, const_112);
     reg2 = __lsx_vmulwev_h_bu(tmp0, const_18);
     reg3 = __lsx_vmulwev_h_bu(tmp2, const_18);
     reg2 = __lsx_vmaddwod_h_bu(reg2, tmp0, const_94);
@@ -1530,12 +1545,12 @@ void ARGB1555ToUVRow_LSX(const uint8_t* src_argb1555,
   __m128i tmp0, tmp1, tmp2, tmp3;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
   __m128i reg0, reg1, reg2, reg3, dst0;
-  __m128i const_112 = __lsx_vldi(0x438);
-  __m128i const_74 = __lsx_vldi(0x425);
-  __m128i const_38 = __lsx_vldi(0x413);
-  __m128i const_94 = __lsx_vldi(0x42F);
-  __m128i const_18 = __lsx_vldi(0x409);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_112 = __lsx_vldi(0x470);
+  __m128i const_74 = __lsx_vldi(0x44A);
+  __m128i const_38 = __lsx_vldi(0x426);
+  __m128i const_94 = __lsx_vldi(0x45E);
+  __m128i const_18 = __lsx_vldi(0x412);
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_argb1555, 0, src_argb1555, 16, next_argb1555, 0,
@@ -1639,12 +1654,12 @@ void RGB565ToUVRow_LSX(const uint8_t* src_rgb565,
   __m128i tmp0, tmp1, tmp2, tmp3;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
   __m128i reg0, reg1, reg2, reg3, dst0;
-  __m128i const_112 = __lsx_vldi(0x438);
-  __m128i const_74 = __lsx_vldi(0x425);
-  __m128i const_38 = __lsx_vldi(0x413);
-  __m128i const_94 = __lsx_vldi(0x42F);
-  __m128i const_18 = __lsx_vldi(0x409);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_112 = __lsx_vldi(0x470);
+  __m128i const_74 = __lsx_vldi(0x44A);
+  __m128i const_38 = __lsx_vldi(0x426);
+  __m128i const_94 = __lsx_vldi(0x45E);
+  __m128i const_18 = __lsx_vldi(0x412);
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_rgb565, 0, src_rgb565, 16, next_rgb565, 0,
@@ -1700,12 +1715,12 @@ void RGB24ToUVRow_LSX(const uint8_t* src_rgb24,
   __m128i src0, src1, src2;
   __m128i nex0, nex1, nex2, dst0;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m128i const_112 = __lsx_vldi(0x438);
-  __m128i const_74 = __lsx_vldi(0x425);
-  __m128i const_38 = __lsx_vldi(0x413);
-  __m128i const_94 = __lsx_vldi(0x42F);
-  __m128i const_18 = __lsx_vldi(0x409);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_112 = __lsx_vldi(0x470);
+  __m128i const_74 = __lsx_vldi(0x44A);
+  __m128i const_38 = __lsx_vldi(0x426);
+  __m128i const_94 = __lsx_vldi(0x45E);
+  __m128i const_18 = __lsx_vldi(0x412);
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   __m128i shuff0_b = {0x15120F0C09060300, 0x00000000001E1B18};
   __m128i shuff1_b = {0x0706050403020100, 0x1D1A1714110A0908};
   __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
@@ -1753,12 +1768,12 @@ void RAWToUVRow_LSX(const uint8_t* src_raw,
   __m128i src0, src1, src2;
   __m128i nex0, nex1, nex2, dst0;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m128i const_112 = __lsx_vldi(0x438);
-  __m128i const_74 = __lsx_vldi(0x425);
-  __m128i const_38 = __lsx_vldi(0x413);
-  __m128i const_94 = __lsx_vldi(0x42F);
-  __m128i const_18 = __lsx_vldi(0x409);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_112 = __lsx_vldi(0x470);
+  __m128i const_74 = __lsx_vldi(0x44A);
+  __m128i const_38 = __lsx_vldi(0x426);
+  __m128i const_94 = __lsx_vldi(0x45E);
+  __m128i const_18 = __lsx_vldi(0x412);
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   __m128i shuff0_r = {0x15120F0C09060300, 0x00000000001E1B18};
   __m128i shuff1_r = {0x0706050403020100, 0x1D1A1714110A0908};
   __m128i shuff0_g = {0x1613100D0A070401, 0x00000000001F1C19};
@@ -1988,12 +2003,12 @@ void BGRAToUVRow_LSX(const uint8_t* src_bgra,
   __m128i nex0, nex1, nex2, nex3;
   __m128i tmp0, tmp1, tmp2, tmp3, dst0;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m128i const_112 = __lsx_vldi(0x438);
-  __m128i const_74 = __lsx_vldi(0x425);
-  __m128i const_38 = __lsx_vldi(0x413);
-  __m128i const_94 = __lsx_vldi(0x42F);
-  __m128i const_18 = __lsx_vldi(0x409);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_112 = __lsx_vldi(0x470);
+  __m128i const_74 = __lsx_vldi(0x44A);
+  __m128i const_38 = __lsx_vldi(0x426);
+  __m128i const_94 = __lsx_vldi(0x45E);
+  __m128i const_18 = __lsx_vldi(0x412);
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_bgra, 0, src_bgra, 16, src_bgra, 32, src_bgra, 48,
@@ -2036,12 +2051,12 @@ void ABGRToUVRow_LSX(const uint8_t* src_abgr,
   __m128i nex0, nex1, nex2, nex3;
   __m128i tmp0, tmp1, tmp2, tmp3, dst0;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m128i const_112 = __lsx_vldi(0x438);
-  __m128i const_74 = __lsx_vldi(0x425);
-  __m128i const_38 = __lsx_vldi(0x413);
-  __m128i const_94 = __lsx_vldi(0x42F);
-  __m128i const_18 = __lsx_vldi(0x409);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_112 = __lsx_vldi(0x470);
+  __m128i const_74 = __lsx_vldi(0x44A);
+  __m128i const_38 = __lsx_vldi(0x426);
+  __m128i const_94 = __lsx_vldi(0x45E);
+  __m128i const_18 = __lsx_vldi(0x412);
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_abgr, 0, src_abgr, 16, src_abgr, 32, src_abgr, 48,
@@ -2084,12 +2099,12 @@ void RGBAToUVRow_LSX(const uint8_t* src_rgba,
   __m128i nex0, nex1, nex2, nex3;
   __m128i tmp0, tmp1, tmp2, tmp3, dst0;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m128i const_112 = __lsx_vldi(0x438);
-  __m128i const_74 = __lsx_vldi(0x425);
-  __m128i const_38 = __lsx_vldi(0x413);
-  __m128i const_94 = __lsx_vldi(0x42F);
-  __m128i const_18 = __lsx_vldi(0x409);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_112 = __lsx_vldi(0x470);
+  __m128i const_74 = __lsx_vldi(0x44A);
+  __m128i const_38 = __lsx_vldi(0x426);
+  __m128i const_94 = __lsx_vldi(0x45E);
+  __m128i const_18 = __lsx_vldi(0x412);
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_rgba, 0, src_rgba, 16, src_rgba, 32, src_rgba, 48,
@@ -2130,15 +2145,15 @@ void ARGBToUVJRow_LSX(const uint8_t* src_argb,
   int len = width / 16;
   __m128i src0, src1, src2, src3;
   __m128i nex0, nex1, nex2, nex3;
-  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
   __m128i reg0, reg1, dst0;
   __m128i tmpb, tmpg, tmpr, nexb, nexg, nexr;
-  __m128i const_63 = __lsx_vldi(0x43F);
-  __m128i const_42 = __lsx_vldi(0x42A);
+  __m128i const_128 = __lsx_vldi(0x480);
+  __m128i const_85 = __lsx_vldi(0x455);
+  __m128i const_43 = __lsx_vldi(0x42B);
+  __m128i const_107 = __lsx_vldi(0x46B);
   __m128i const_21 = __lsx_vldi(0x415);
-  __m128i const_53 = __lsx_vldi(0x435);
-  __m128i const_10 = __lsx_vldi(0x40A);
-  __m128i const_8080 = (__m128i)v2u64{0x8080808080808080, 0x8080808080808080};
+  __m128i const_8000 = (__m128i)v2u64{0x8000800080008000, 0x8000800080008000};
   for (x = 0; x < len; x++) {
     DUP4_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src_argb, 32, src_argb, 48,
@@ -2165,15 +2180,27 @@ void ARGBToUVJRow_LSX(const uint8_t* src_argb,
     tmp3 = __lsx_vaddwod_h_bu(tmpg, nexg);
     reg0 = __lsx_vaddwev_h_bu(tmpr, nexr);
     reg1 = __lsx_vaddwod_h_bu(tmpr, nexr);
-    tmpb = __lsx_vavgr_hu(tmp0, tmp1);
-    tmpg = __lsx_vavgr_hu(tmp2, tmp3);
-    tmpr = __lsx_vavgr_hu(reg0, reg1);
-    reg0 = __lsx_vmadd_h(const_8080, const_63, tmpb);
-    reg1 = __lsx_vmadd_h(const_8080, const_63, tmpr);
-    reg0 = __lsx_vmsub_h(reg0, const_42, tmpg);
-    reg1 = __lsx_vmsub_h(reg1, const_53, tmpg);
-    reg0 = __lsx_vmsub_h(reg0, const_21, tmpr);
-    reg1 = __lsx_vmsub_h(reg1, const_10, tmpb);
+    tmp4 = __lsx_vaddwev_w_hu(tmp0, tmp1);
+    tmp5 = __lsx_vaddwod_w_hu(tmp0, tmp1);
+    tmp0 = __lsx_vilvl_w(tmp5, tmp4);
+    tmp1 = __lsx_vilvh_w(tmp5, tmp4);
+    tmpb = __lsx_vssrarni_hu_w(tmp1, tmp0, 2);
+    tmp4 = __lsx_vaddwev_w_hu(tmp2, tmp3);
+    tmp5 = __lsx_vaddwod_w_hu(tmp2, tmp3);
+    tmp2 = __lsx_vilvl_w(tmp5, tmp4);
+    tmp3 = __lsx_vilvh_w(tmp5, tmp4);
+    tmpg = __lsx_vssrarni_hu_w(tmp3, tmp2, 2);
+    tmp4 = __lsx_vaddwev_w_hu(reg0, reg1);
+    tmp5 = __lsx_vaddwod_w_hu(reg0, reg1);
+    tmp0 = __lsx_vilvl_w(tmp5, tmp4);
+    tmp1 = __lsx_vilvh_w(tmp5, tmp4);
+    tmpr = __lsx_vssrarni_hu_w(tmp1, tmp0, 2);
+    reg0 = __lsx_vmadd_h(const_8000, const_128, tmpb);
+    reg1 = __lsx_vmadd_h(const_8000, const_128, tmpr);
+    reg0 = __lsx_vmsub_h(reg0, const_85, tmpg);
+    reg1 = __lsx_vmsub_h(reg1, const_107, tmpg);
+    reg0 = __lsx_vmsub_h(reg0, const_43, tmpr);
+    reg1 = __lsx_vmsub_h(reg1, const_21, tmpb);
     dst0 = __lsx_vpickod_b(reg1, reg0);
     __lsx_vstelm_d(dst0, dst_u, 0, 0);
     __lsx_vstelm_d(dst0, dst_v, 0, 1);