Enable I422To{ARGB,RGBA,RGB24}Row_RVV

Run on SiFive internal FPGA: I422ToARGB_Opt (~10x vs scalar) I422ToRGBA_Opt (~10x vs scalar) I420ToRGB24_Opt (~8x vs scalar) LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10 This CL manually sets rounding mode, since we use fixed-point vector narrowing clip. There is no definition about default value for fixed-point rounding mode. https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#38-vector-fixed-point-rounding-mode-register-vxrm The behavior could be different on differet paltforms. To avoid unexpected behavior, we set rounding mode manually. Change-Id: I90f0dcb90c37f7da7caab8eb1df6c9c7a3c874a8 Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4512373 Reviewed-by: Wan-Teh Chang <wtc@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
2025-12-07 17:26:49 +08:00 · 2023-05-02 00:33:27 -07:00 · 2023-05-02 00:33:27 -07:00 · 964d963afb
commit 964d963afb
parent 1d940cc570
5 changed files with 221 additions and 5 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -782,6 +782,9 @@ extern "C" {
 #define HAS_ARGBTOAR64ROW_RVV
 #define HAS_ARGBTORAWROW_RVV
 #define HAS_ARGBTORGB24ROW_RVV
+#define HAS_I422TOARGBROW_RVV
+#define HAS_I422TORGB24ROW_RVV
+#define HAS_I422TORGBAROW_RVV
 #define HAS_MERGEARGBROW_RVV
 #define HAS_MERGERGBROW_RVV
 #define HAS_MERGEXRGBROW_RVV
@ -853,8 +856,8 @@ typedef uint32_t ulvec32[8];
 typedef uint8_t ulvec8[32];
 #endif

-#if defined(__aarch64__) || defined(__arm__)
-// This struct is for ARM color conversion.
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+// This struct is for ARM and RISC-V color conversion.
 struct YuvConstants {
  uvec8 kUVCoeff;
  vec16 kRGBCoeffBias;
@ -1059,6 +1062,24 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width);
+void I422ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgba,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
 void I444ToARGBRow_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@ -136,6 +136,11 @@ int I420ToARGBMatrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToARGBRow = I422ToARGBRow_RVV;
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@ -385,6 +390,11 @@ int I422ToARGBMatrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToARGBRow = I422ToARGBRow_RVV;
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@ -4511,6 +4521,11 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TORGBAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToRGBARow = I422ToRGBARow_RVV;
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@ -4734,6 +4749,11 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TORGBAROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToRGBARow = I422ToRGBARow_RVV;
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@ -4859,6 +4879,11 @@ int I420ToRGB24Matrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TORGB24ROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToRGB24Row = I422ToRGB24Row_RVV;
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@ -5056,6 +5081,11 @@ int I422ToRGB24Matrix(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TORGB24ROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToRGB24Row = I422ToRGB24Row_RVV;
+  }
+#endif

  for (y = 0; y < height; ++y) {
    I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@ -5620,6 +5650,11 @@ int I420ToRGB565Dither(const uint8_t* src_y,
    }
  }
 #endif
+#if defined(HAS_I422TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToARGBRow = I422ToARGBRow_RVV;
+  }
+#endif
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -1484,7 +1484,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {

 // clang-format off

-#if defined(__aarch64__) || defined(__arm__)
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
 // Bias values include subtract 128 from U and V, bias from Y and rounding.
 // For B and R bias is negative. For G bias is positive.
 #define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                             \
@ -1680,7 +1680,7 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)

 #undef MAKEYUVCONSTANTS

-#if defined(__aarch64__) || defined(__arm__)
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
 #define LOAD_YUV_CONSTANTS                 \
  int ub = yuvconstants->kUVCoeff[0];      \
  int vr = yuvconstants->kUVCoeff[1];      \
@ -1868,7 +1868,7 @@ static __inline void YPixel(uint8_t y,
                            uint8_t* g,
                            uint8_t* r,
                            const struct YuvConstants* yuvconstants) {
-#if defined(__aarch64__) || defined(__arm__)
+#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
  int yg = yuvconstants->kRGBCoeffBias[0];
  int ygb = yuvconstants->kRGBCoeffBias[4];
 #else
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@ -27,6 +27,72 @@ namespace libyuv {
 extern "C" {
 #endif

+// Fill YUV -> RGB conversion constants into vectors
+// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
+// register) is set to round-down mode(2).
+#define YUVTORGB_SETUP(yuvconst, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, \
+                       v_br)                                                   \
+  {                                                                            \
+    asm volatile("csrwi vxrm, 2");                                             \
+    vl = __riscv_vsetvl_e8m1(w);                                               \
+    v_ub = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[0], vl);                    \
+    v_vr = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[1], vl);                    \
+    v_ug = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[2], vl);                    \
+    v_vg = __riscv_vmv_v_x_u8m1(yuvconst->kUVCoeff[3], vl);                    \
+    v_yg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[0], vl);              \
+    v_bb = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[1], vl);              \
+    v_bg = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[2], vl);              \
+    v_br = __riscv_vmv_v_x_u16m2(yuvconst->kRGBCoeffBias[3], vl);              \
+  }
+
+// Read [VLEN/8] Y, [VLEN/(8 * 2)] U and [VLEN/(8 * 2)] V from 422
+#define READYUV422(vl, v_u, v_v, v_y_16)                \
+  {                                                     \
+    vuint8mf2_t v_tmp0, v_tmp1;                         \
+    vuint8m1_t v_y;                                     \
+    vuint16m1_t v_u_16, v_v_16;                         \
+    vl = __riscv_vsetvl_e8mf2((w + 1) / 2);             \
+    v_tmp0 = __riscv_vle8_v_u8mf2(src_u, vl);           \
+    v_u_16 = __riscv_vwaddu_vx_u16m1(v_tmp0, 0, vl);    \
+    v_tmp1 = __riscv_vle8_v_u8mf2(src_v, vl);           \
+    v_v_16 = __riscv_vwaddu_vx_u16m1(v_tmp1, 0, vl);    \
+    v_v_16 = __riscv_vmul_vx_u16m1(v_v_16, 0x0101, vl); \
+    v_u_16 = __riscv_vmul_vx_u16m1(v_u_16, 0x0101, vl); \
+    v_v = __riscv_vreinterpret_v_u16m1_u8m1(v_v_16);    \
+    v_u = __riscv_vreinterpret_v_u16m1_u8m1(v_u_16);    \
+    vl = __riscv_vsetvl_e8m1(w);                        \
+    v_y = __riscv_vle8_v_u8m1(src_y, vl);               \
+    v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl);       \
+  }
+
+// Convert from YUV to fixed point RGB
+#define YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, \
+                 v_g_16, v_b_16, v_r_16)                                   \
+  {                                                                        \
+    vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4;                    \
+    vuint32m4_t v_tmp5;                                                    \
+    v_tmp0 = __riscv_vwmulu_vv_u16m2(v_u, v_ug, vl);                       \
+    v_y_16 = __riscv_vmul_vx_u16m2(v_y_16, 0x0101, vl);                    \
+    v_tmp0 = __riscv_vwmaccu_vv_u16m2(v_tmp0, v_vg, v_v, vl);              \
+    v_tmp1 = __riscv_vwmulu_vv_u16m2(v_u, v_ub, vl);                       \
+    v_tmp5 = __riscv_vwmulu_vv_u32m4(v_y_16, v_yg, vl);                    \
+    v_tmp2 = __riscv_vnsrl_wx_u16m2(v_tmp5, 16, vl);                       \
+    v_tmp3 = __riscv_vadd_vv_u16m2(v_tmp2, v_bg, vl);                      \
+    v_tmp4 = __riscv_vadd_vv_u16m2(v_tmp2, v_tmp1, vl);                    \
+    v_tmp2 = __riscv_vwmaccu_vv_u16m2(v_tmp2, v_vr, v_v, vl);              \
+    v_g_16 = __riscv_vssubu_vv_u16m2(v_tmp3, v_tmp0, vl);                  \
+    v_b_16 = __riscv_vssubu_vv_u16m2(v_tmp4, v_bb, vl);                    \
+    v_r_16 = __riscv_vssubu_vv_u16m2(v_tmp2, v_br, vl);                    \
+  }
+
+// Convert from fixed point RGB To 8 bit RGB
+#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
+  {                                                          \
+    v_g = __riscv_vnclipu_wx_u8m1(v_g_16, 6, vl);            \
+    v_b = __riscv_vnclipu_wx_u8m1(v_b_16, 6, vl);            \
+    v_r = __riscv_vnclipu_wx_u8m1(v_r_16, 6, vl);            \
+  }
+
 void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
  size_t avl = (size_t)4 * width;
  do {
@ -186,6 +252,95 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
  } while (w > 0);
 }

+void I422ToARGBRow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t vl;
+  size_t w = (size_t)width;
+  vuint8m1_t v_u, v_v;
+  vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+  vuint8m1_t v_b, v_g, v_r, v_a;
+  vuint16m2_t v_yg, v_bb, v_bg, v_br;
+  vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+                 v_br);
+  v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+  do {
+    READYUV422(vl, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl / 2;
+    src_v += vl / 2;
+    dst_argb += vl * 4;
+  } while (w > 0);
+}
+
+void I422ToRGBARow_RVV(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgba,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  size_t vl;
+  size_t w = (size_t)width;
+  vuint8m1_t v_u, v_v;
+  vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+  vuint8m1_t v_b, v_g, v_r, v_a;
+  vuint16m2_t v_yg, v_bb, v_bg, v_br;
+  vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+                 v_br);
+  v_a = __riscv_vmv_v_x_u8m1(255u, vl);
+  do {
+    READYUV422(vl, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    __riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl / 2;
+    src_v += vl / 2;
+    dst_rgba += vl * 4;
+  } while (w > 0);
+}
+
+void I422ToRGB24Row_RVV(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgb24,
+                        const struct YuvConstants* yuvconstants,
+                        int width) {
+  size_t vl;
+  size_t w = (size_t)width;
+  vuint8m1_t v_u, v_v;
+  vuint8m1_t v_ub, v_vr, v_ug, v_vg;
+  vuint8m1_t v_b, v_g, v_r;
+  vuint16m2_t v_yg, v_bb, v_bg, v_br;
+  vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
+  YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
+                 v_br);
+  do {
+    READYUV422(vl, v_u, v_v, v_y_16);
+    YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16,
+             v_b_16, v_r_16);
+    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
+    __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
+    w -= vl;
+    src_y += vl;
+    src_u += vl / 2;
+    src_v += vl / 2;
+    dst_rgb24 += vl * 3;
+  } while (w > 0);
+}
+
 void SplitRGBRow_RVV(const uint8_t* src_rgb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@ -675,6 +675,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
    }
  }
 #endif
+#if defined(HAS_I422TOARGBROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    I422ToARGBRow = I422ToARGBRow_RVV;
+  }
+#endif

  void (*InterpolateRow)(uint8_t* dst_argb, const uint8_t* src_argb,
                         ptrdiff_t src_stride, int dst_width,