Add Split/Merge RGB/ARGB/XRGB Row_RVV

* Run on SiFive internal FPGA:

SplitRGBPlane_Opt (~6.87x vs scalar)

SplitARGBPlane_Opt (~10.77x vs scalar)

SplitXRGBPlane_Opt (~18.69x vs scalar)

MergeRGBPlane_Opt (~3.63x vs scalar)

MergeARGBPlane_Opt (~3.50x vs scalar)

MergeXRGBPlane_Opt (~2.90x vs scalar)

LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10

- include a fix to avoid implict conversion warning between size_t & int.

Bug: libyuv:956

Change-Id: Icd79b282b04ea3981e7fd4e6d547da6708d82516
Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4443411
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Darren Hsieh 2023-04-11 00:05:48 -07:00 committed by libyuv LUCI CQ
parent 7c6a7e5737
commit 1b3c4c12d4
3 changed files with 251 additions and 33 deletions

View File

@ -764,6 +764,12 @@ extern "C" {
#define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTOAR64ROW_RVV
#define HAS_ARGBTORAWROW_RVV #define HAS_ARGBTORAWROW_RVV
#define HAS_ARGBTORGB24ROW_RVV #define HAS_ARGBTORGB24ROW_RVV
#define HAS_MERGEARGBROW_RVV
#define HAS_MERGERGBROW_RVV
#define HAS_MERGEXRGBROW_RVV
#define HAS_SPLITARGBROW_RVV
#define HAS_SPLITRGBROW_RVV
#define HAS_SPLITXRGBROW_RVV
#define HAS_RAWTOARGBROW_RVV #define HAS_RAWTOARGBROW_RVV
#define HAS_RAWTORGB24ROW_RVV #define HAS_RAWTORGB24ROW_RVV
#define HAS_RAWTORGBAROW_RVV #define HAS_RAWTORGBAROW_RVV
@ -2282,6 +2288,11 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_g, uint8_t* dst_g,
uint8_t* dst_b, uint8_t* dst_b,
int width); int width);
void SplitRGBRow_RVV(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_r, uint8_t* dst_r,
uint8_t* dst_g, uint8_t* dst_g,
@ -2308,6 +2319,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b, const uint8_t* src_b,
uint8_t* dst_rgb, uint8_t* dst_rgb,
int width); int width);
void MergeRGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width);
void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,
@ -2342,6 +2358,12 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_a, const uint8_t* src_a,
uint8_t* dst_argb, uint8_t* dst_argb,
int width); int width);
void MergeARGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, void MergeARGBRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,
@ -2390,6 +2412,12 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_b, uint8_t* dst_b,
uint8_t* dst_a, uint8_t* dst_a,
int width); int width);
void SplitARGBRow_RVV(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_r, uint8_t* dst_r,
uint8_t* dst_g, uint8_t* dst_g,
@ -2434,6 +2462,11 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b, const uint8_t* src_b,
uint8_t* dst_argb, uint8_t* dst_argb,
int width); int width);
void MergeXRGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* u_buf, const uint8_t* u_buf,
const uint8_t* v_buf, const uint8_t* v_buf,
@ -2474,6 +2507,11 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_g, uint8_t* dst_g,
uint8_t* dst_b, uint8_t* dst_b,
int width); int width);
void SplitXRGBRow_RVV(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_r, uint8_t* dst_r,
uint8_t* dst_g, uint8_t* dst_g,

View File

@ -1268,6 +1268,11 @@ void SplitRGBPlane(const uint8_t* src_rgb,
} }
} }
#endif #endif
#if defined(HAS_SPLITRGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
SplitRGBRow = SplitRGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
// Copy a row of RGB. // Copy a row of RGB.
@ -1327,6 +1332,11 @@ void MergeRGBPlane(const uint8_t* src_r,
} }
} }
#endif #endif
#if defined(HAS_MERGERGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
MergeRGBRow = MergeRGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of RGB. // Merge a row of U and V into a row of RGB.
@ -1358,6 +1368,9 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
assert(height > 0); assert(height > 0);
if (width <= 0 || height == 0) {
return;
}
if (src_stride_argb == width * 4 && dst_stride_r == width && if (src_stride_argb == width * 4 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) {
width *= height; width *= height;
@ -1398,6 +1411,11 @@ static void SplitARGBPlaneAlpha(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_SPLITARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
SplitARGBRow = SplitARGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width);
@ -1425,6 +1443,9 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
uint8_t* dst_b, int width) = SplitXRGBRow_C; uint8_t* dst_b, int width) = SplitXRGBRow_C;
assert(height > 0); assert(height > 0);
if (width <= 0 || height == 0) {
return;
}
if (src_stride_argb == width * 4 && dst_stride_r == width && if (src_stride_argb == width * 4 && dst_stride_r == width &&
dst_stride_g == width && dst_stride_b == width) { dst_stride_g == width && dst_stride_b == width) {
width *= height; width *= height;
@ -1464,6 +1485,11 @@ static void SplitARGBPlaneOpaque(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_SPLITXRGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
SplitXRGBRow = SplitXRGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width);
@ -1530,6 +1556,9 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r,
assert(height > 0); assert(height > 0);
if (width <= 0 || height == 0) {
return;
}
if (src_stride_r == width && src_stride_g == width && src_stride_b == width && if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
src_stride_a == width && dst_stride_argb == width * 4) { src_stride_a == width && dst_stride_argb == width * 4) {
width *= height; width *= height;
@ -1561,6 +1590,11 @@ static void MergeARGBPlaneAlpha(const uint8_t* src_r,
} }
} }
#endif #endif
#if defined(HAS_MERGEARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
MergeARGBRow = MergeARGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width);
@ -1590,6 +1624,9 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r,
assert(height > 0); assert(height > 0);
if (width <= 0 || height == 0) {
return;
}
if (src_stride_r == width && src_stride_g == width && src_stride_b == width && if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
dst_stride_argb == width * 4) { dst_stride_argb == width * 4) {
width *= height; width *= height;
@ -1620,6 +1657,11 @@ static void MergeARGBPlaneOpaque(const uint8_t* src_r,
} }
} }
#endif #endif
#if defined(HAS_MERGEXRGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
MergeXRGBRow = MergeXRGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); MergeXRGBRow(src_r, src_g, src_b, dst_argb, width);

View File

@ -99,85 +99,223 @@ void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
} }
void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
size_t vl = __riscv_vsetvl_e8m2(width); size_t w = (size_t)width;
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
do { do {
vuint8m2_t v_b, v_g, v_r; vuint8m2_t v_b, v_g, v_r;
__riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
__riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
width -= vl; w -= vl;
src_raw += (3 * vl); src_raw += vl * 3;
dst_argb += (4 * vl); dst_argb += vl * 4;
vl = __riscv_vsetvl_e8m2(width); vl = __riscv_vsetvl_e8m2(w);
} while (width > 0); } while (w > 0);
} }
void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
size_t vl = __riscv_vsetvl_e8m2(width); size_t w = (size_t)width;
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
do { do {
vuint8m2_t v_b, v_g, v_r; vuint8m2_t v_b, v_g, v_r;
__riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl); __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
__riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl); __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
width -= vl; w -= vl;
src_raw += (3 * vl); src_raw += vl * 3;
dst_rgba += (4 * vl); dst_rgba += vl * 4;
vl = __riscv_vsetvl_e8m2(width); vl = __riscv_vsetvl_e8m2(w);
} while (width > 0); } while (w > 0);
} }
void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
size_t w = (size_t)width;
do { do {
vuint8m2_t v_b, v_g, v_r; vuint8m2_t v_b, v_g, v_r;
size_t vl = __riscv_vsetvl_e8m2(width); size_t vl = __riscv_vsetvl_e8m2(w);
__riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl); __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl);
__riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl); __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl);
width -= vl; w -= vl;
src_raw += (3 * vl); src_raw += vl * 3;
dst_rgb24 += (3 * vl); dst_rgb24 += vl * 3;
} while (width > 0); } while (w > 0);
} }
void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
size_t w = (size_t)width;
do { do {
vuint8m2_t v_b, v_g, v_r, v_a; vuint8m2_t v_b, v_g, v_r, v_a;
size_t vl = __riscv_vsetvl_e8m2(width); size_t vl = __riscv_vsetvl_e8m2(w);
__riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
__riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl); __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl);
width -= vl; w -= vl;
src_argb += (4 * vl); src_argb += vl * 4;
dst_raw += (3 * vl); dst_raw += vl * 3;
} while (width > 0); } while (w > 0);
} }
void ARGBToRGB24Row_RVV(const uint8_t* src_argb, void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
uint8_t* dst_rgb24, uint8_t* dst_rgb24,
int width) { int width) {
size_t w = (size_t)width;
do { do {
vuint8m2_t v_b, v_g, v_r, v_a; vuint8m2_t v_b, v_g, v_r, v_a;
size_t vl = __riscv_vsetvl_e8m2(width); size_t vl = __riscv_vsetvl_e8m2(w);
__riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
__riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl); __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
width -= vl; w -= vl;
src_argb += (4 * vl); src_argb += vl * 4;
dst_rgb24 += (3 * vl); dst_rgb24 += vl * 3;
} while (width > 0); } while (w > 0);
} }
void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
size_t vl = __riscv_vsetvl_e8m2(width); size_t w = (size_t)width;
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
do { do {
vuint8m2_t v_b, v_g, v_r; vuint8m2_t v_b, v_g, v_r;
__riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl); __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl);
__riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
width -= vl; w -= vl;
src_rgb24 += (3 * vl); src_rgb24 += vl * 3;
dst_argb += (4 * vl); dst_argb += vl * 4;
vl = __riscv_vsetvl_e8m2(width); vl = __riscv_vsetvl_e8m2(w);
} while (width > 0); } while (w > 0);
}
void SplitRGBRow_RVV(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
size_t w = (size_t)width;
do {
vuint8m2_t v_b, v_g, v_r;
size_t vl = __riscv_vsetvl_e8m2(w);
__riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl);
__riscv_vse8_v_u8m2(dst_r, v_r, vl);
__riscv_vse8_v_u8m2(dst_g, v_g, vl);
__riscv_vse8_v_u8m2(dst_b, v_b, vl);
w -= vl;
dst_r += vl;
dst_g += vl;
dst_b += vl;
src_rgb += vl * 3;
} while (w > 0);
}
void MergeRGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width) {
size_t w = (size_t)width;
do {
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
__riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl);
w -= vl;
src_r += vl;
src_g += vl;
src_b += vl;
dst_rgb += vl * 3;
} while (w > 0);
}
void SplitARGBRow_RVV(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
size_t w = (size_t)width;
do {
vuint8m2_t v_b, v_g, v_r, v_a;
size_t vl = __riscv_vsetvl_e8m2(w);
__riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
__riscv_vse8_v_u8m2(dst_a, v_a, vl);
__riscv_vse8_v_u8m2(dst_r, v_r, vl);
__riscv_vse8_v_u8m2(dst_g, v_g, vl);
__riscv_vse8_v_u8m2(dst_b, v_b, vl);
w -= vl;
dst_a += vl;
dst_r += vl;
dst_g += vl;
dst_b += vl;
src_argb += vl * 4;
} while (w > 0);
}
void MergeARGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
size_t w = (size_t)width;
do {
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl);
__riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
src_r += vl;
src_g += vl;
src_b += vl;
src_a += vl;
dst_argb += vl * 4;
} while (w > 0);
}
void SplitXRGBRow_RVV(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
size_t w = (size_t)width;
do {
vuint8m2_t v_b, v_g, v_r, v_a;
size_t vl = __riscv_vsetvl_e8m2(w);
__riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
__riscv_vse8_v_u8m2(dst_r, v_r, vl);
__riscv_vse8_v_u8m2(dst_g, v_g, vl);
__riscv_vse8_v_u8m2(dst_b, v_b, vl);
w -= vl;
dst_r += vl;
dst_g += vl;
dst_b += vl;
src_argb += vl * 4;
} while (w > 0);
}
void MergeXRGBRow_RVV(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
size_t w = (size_t)width;
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
do {
vuint8m2_t v_r, v_g, v_b;
v_r = __riscv_vle8_v_u8m2(src_r, vl);
v_g = __riscv_vle8_v_u8m2(src_g, vl);
v_b = __riscv_vle8_v_u8m2(src_b, vl);
__riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
src_r += vl;
src_g += vl;
src_b += vl;
dst_argb += vl * 4;
vl = __riscv_vsetvl_e8m2(w);
} while (w > 0);
} }
#ifdef __cplusplus #ifdef __cplusplus