From 44396e6e9aad554283c8f1fbe981ac122c40dfc7 Mon Sep 17 00:00:00 2001 From: Darren Hsieh Date: Sun, 9 Apr 2023 21:34:25 -0700 Subject: [PATCH] Add ARGBToRAWRow_RVV, ARGBToRGB24Row_RVV, RGB24ToARGBRow_RVV * Run on SiFive internal FPGA: ARGBToRAW_Opt (~1.55x vs scalar) ARGBToRGB24_Opt (~1.44x vs scalar) RGB24ToARGB_Opt (~1.77x vs scalar) LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10 Bug: libyuv:956 Change-Id: I26722f6848cd68684d95d9a7ee06ce0416e7985d Signed-off-by: Darren Hsieh Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4413083 Reviewed-by: Frank Barchard Commit-Queue: Frank Barchard --- include/libyuv/row.h | 9 +++++- source/convert_argb.cc | 5 ++++ source/convert_from_argb.cc | 10 +++++++ source/row_rvv.cc | 58 ++++++++++++++++++++++++++++++++----- 4 files changed, 73 insertions(+), 9 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 08004c0cc..6140443b0 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -758,9 +758,12 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv) +#define HAS_ARGBTORAWROW_RVV +#define HAS_ARGBTORGB24ROW_RVV #define HAS_RAWTOARGBROW_RVV -#define HAS_RAWTORGBAROW_RVV #define HAS_RAWTORGB24ROW_RVV +#define HAS_RAWTORGBAROW_RVV +#define HAS_RGB24TOARGBROW_RVV #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -2961,6 +2964,7 @@ void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); @@ -3197,6 +3201,9 @@ void ARGBToARGB4444Row_LASX(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width); + void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); diff --git a/source/convert_argb.cc b/source/convert_argb.cc index e25ecefa9..f490e9c13 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3049,6 +3049,11 @@ int RGB24ToARGB(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + RGB24ToARGBRow = RGB24ToARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 55516cbd8..e5608adba 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1487,6 +1487,11 @@ int ARGBToRGB24(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRGB24Row = ARGBToRGB24Row_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -1561,6 +1566,11 @@ int ARGBToRAW(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBToRAWRow = ARGBToRAWRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 0f264d349..629eca465 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -30,33 +30,33 @@ extern "C" { void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { size_t vl = __riscv_vsetvl_e8m2(width); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - while (width > 0) { + do { vuint8m2_t v_b, v_g, v_r; - vl = __riscv_vsetvl_e8m2(width); __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); width -= vl; src_raw += (3 * vl); dst_argb += (4 * vl); - } + vl = __riscv_vsetvl_e8m2(width); + } while (width > 0); } void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { size_t vl = __riscv_vsetvl_e8m2(width); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); - while (width > 0) { + do { vuint8m2_t v_b, v_g, v_r; - vl = __riscv_vsetvl_e8m2(width); __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); width -= vl; src_raw += (3 * vl); dst_rgba += (4 * vl); - } + vl = __riscv_vsetvl_e8m2(width); + } while (width > 0); } void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - while (width > 0) { + do { vuint8m2_t v_b, v_g, v_r; size_t vl = __riscv_vsetvl_e8m2(width); __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); @@ -64,7 +64,49 @@ void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { width -= vl; src_raw += (3 * vl); dst_rgb24 += (3 * vl); - } + } while (width > 0); +} + +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(width); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); + width -= vl; + src_argb += (4 * vl); + dst_raw += (3 * vl); + } while (width > 0); +} + +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(width); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); + width -= vl; + src_argb += (4 * vl); + dst_rgb24 += (3 * vl); + } while (width > 0); +} + +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t vl = __riscv_vsetvl_e8m2(width); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_b, v_g, v_r; + __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + width -= vl; + src_rgb24 += (3 * vl); + dst_argb += (4 * vl); + vl = __riscv_vsetvl_e8m2(width); + } while (width > 0); } #ifdef __cplusplus