From 1b3c4c12d4b7972b6656438a37949309bfb2c18a Mon Sep 17 00:00:00 2001 From: Darren Hsieh Date: Tue, 11 Apr 2023 00:05:48 -0700 Subject: [PATCH] Add Split/Merge RGB/ARGB/XRGB Row_RVV * Run on SiFive internal FPGA: SplitRGBPlane_Opt (~6.87x vs scalar) SplitARGBPlane_Opt (~10.77x vs scalar) SplitXRGBPlane_Opt (~18.69x vs scalar) MergeRGBPlane_Opt (~3.63x vs scalar) MergeARGBPlane_Opt (~3.50x vs scalar) MergeXRGBPlane_Opt (~2.90x vs scalar) LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10 - include a fix to avoid implict conversion warning between size_t & int. Bug: libyuv:956 Change-Id: Icd79b282b04ea3981e7fd4e6d547da6708d82516 Signed-off-by: Darren Hsieh Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4443411 Commit-Queue: Frank Barchard Reviewed-by: Frank Barchard --- include/libyuv/row.h | 38 +++++++ source/planar_functions.cc | 42 ++++++++ source/row_rvv.cc | 204 +++++++++++++++++++++++++++++++------ 3 files changed, 251 insertions(+), 33 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 176b37814..53a8d8a6f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -764,6 +764,12 @@ extern "C" { #define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTORAWROW_RVV #define HAS_ARGBTORGB24ROW_RVV +#define HAS_MERGEARGBROW_RVV +#define HAS_MERGERGBROW_RVV +#define HAS_MERGEXRGBROW_RVV +#define HAS_SPLITARGBROW_RVV +#define HAS_SPLITRGBROW_RVV +#define HAS_SPLITXRGBROW_RVV #define HAS_RAWTOARGBROW_RVV #define HAS_RAWTORGB24ROW_RVV #define HAS_RAWTORGBAROW_RVV @@ -2282,6 +2288,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2308,6 +2319,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width); +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2342,6 +2358,12 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width); void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2390,6 +2412,12 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width); +void SplitARGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width); void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, @@ -2434,6 +2462,11 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width); void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2474,6 +2507,11 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width); +void SplitXRGBRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b5a2e1a03..c6f9d5c75 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1268,6 +1268,11 @@ void SplitRGBPlane(const uint8_t* src_rgb, } } #endif +#if defined(HAS_SPLITRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitRGBRow = SplitRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Copy a row of RGB. @@ -1327,6 +1332,11 @@ void MergeRGBPlane(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGERGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeRGBRow = MergeRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of RGB. @@ -1358,6 +1368,9 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { width *= height; @@ -1398,6 +1411,11 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitARGBRow = SplitARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); @@ -1425,6 +1443,9 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb, uint8_t* dst_b, int width) = SplitXRGBRow_C; assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width) { width *= height; @@ -1464,6 +1485,11 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb, } } #endif +#if defined(HAS_SPLITXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + SplitXRGBRow = SplitXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); @@ -1530,6 +1556,9 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_argb == width * 4) { width *= height; @@ -1561,6 +1590,11 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeARGBRow = MergeARGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); @@ -1590,6 +1624,9 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r, assert(height > 0); + if (width <= 0 || height == 0) { + return; + } if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_argb == width * 4) { width *= height; @@ -1620,6 +1657,11 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r, } } #endif +#if defined(HAS_MERGEXRGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + MergeXRGBRow = MergeXRGBRow_RVV; + } +#endif for (y = 0; y < height; ++y) { MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); diff --git a/source/row_rvv.cc b/source/row_rvv.cc index bd21d44e6..0ca4740b3 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -99,85 +99,223 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { } void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - size_t vl = __riscv_vsetvl_e8m2(width); + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { vuint8m2_t v_b, v_g, v_r; __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - width -= vl; - src_raw += (3 * vl); - dst_argb += (4 * vl); - vl = __riscv_vsetvl_e8m2(width); - } while (width > 0); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); } void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - size_t vl = __riscv_vsetvl_e8m2(width); + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { vuint8m2_t v_b, v_g, v_r; __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); - width -= vl; - src_raw += (3 * vl); - dst_rgba += (4 * vl); - vl = __riscv_vsetvl_e8m2(width); - } while (width > 0); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); } void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; do { vuint8m2_t v_b, v_g, v_r; - size_t vl = __riscv_vsetvl_e8m2(width); + size_t vl = __riscv_vsetvl_e8m2(w); __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); - width -= vl; - src_raw += (3 * vl); - dst_rgb24 += (3 * vl); - } while (width > 0); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); } void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; do { vuint8m2_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e8m2(width); + size_t vl = __riscv_vsetvl_e8m2(w); __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); - width -= vl; - src_argb += (4 * vl); - dst_raw += (3 * vl); - } while (width > 0); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); } void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; do { vuint8m2_t v_b, v_g, v_r, v_a; - size_t vl = __riscv_vsetvl_e8m2(width); + size_t vl = __riscv_vsetvl_e8m2(w); __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); - width -= vl; - src_argb += (4 * vl); - dst_rgb24 += (3 * vl); - } while (width > 0); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); } void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - size_t vl = __riscv_vsetvl_e8m2(width); + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); do { vuint8m2_t v_b, v_g, v_r; __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); - width -= vl; - src_rgb24 += (3 * vl); - dst_argb += (4 * vl); - vl = __riscv_vsetvl_e8m2(width); - } while (width > 0); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} + +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} + +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} + +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} + +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + vuint8m2_t v_b, v_g, v_r, v_a; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} + +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r, v_g, v_b; + v_r = __riscv_vle8_v_u8m2(src_r, vl); + v_g = __riscv_vle8_v_u8m2(src_g, vl); + v_b = __riscv_vle8_v_u8m2(src_b, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); } #ifdef __cplusplus