Enable I444To{ARGB,RGB24}Row_RVV

Run on SiFive internal FPGA:

I444ToARGB_Opt (~16x vs scalar)
I444ToRGB24_Opt (~10x vs scalar)

LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=10

Change-Id: Idae7dc46ef648beaa14b58ba3eb56b67b17c9b3b
Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4520761
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Darren Hsieh 2023-05-09 01:39:06 -07:00 committed by libyuv LUCI CQ
parent 964d963afb
commit 497ea35688
3 changed files with 137 additions and 23 deletions

View File

@ -785,6 +785,8 @@ extern "C" {
#define HAS_I422TOARGBROW_RVV #define HAS_I422TOARGBROW_RVV
#define HAS_I422TORGB24ROW_RVV #define HAS_I422TORGB24ROW_RVV
#define HAS_I422TORGBAROW_RVV #define HAS_I422TORGBAROW_RVV
#define HAS_I444TOARGBROW_RVV
#define HAS_I444TORGB24ROW_RVV
#define HAS_MERGEARGBROW_RVV #define HAS_MERGEARGBROW_RVV
#define HAS_MERGERGBROW_RVV #define HAS_MERGERGBROW_RVV
#define HAS_MERGEXRGBROW_RVV #define HAS_MERGEXRGBROW_RVV
@ -1062,6 +1064,18 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I444ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I444ToRGB24Row_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_RVV(const uint8_t* src_y, void I422ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,

View File

@ -625,6 +625,11 @@ int I444ToARGBMatrix(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TOARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
I444ToARGBRow = I444ToARGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@ -855,6 +860,11 @@ int I444ToRGB24Matrix(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TORGB24ROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
I444ToRGB24Row = I444ToRGB24Row_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); I444ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@ -5913,6 +5923,11 @@ static int I420ToARGBMatrixBilinear(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TOARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
I444ToARGBRow = I444ToARGBRow_RVV;
}
#endif
#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) #if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
@ -6047,6 +6062,11 @@ static int I422ToARGBMatrixLinear(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TOARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
I444ToARGBRow = I444ToARGBRow_RVV;
}
#endif
#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) #if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;
@ -6159,6 +6179,11 @@ static int I420ToRGB24MatrixBilinear(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TORGB24ROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
I444ToRGB24Row = I444ToRGB24Row_RVV;
}
#endif
#if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) #if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
@ -7625,6 +7650,11 @@ static int I422ToRGB24MatrixLinear(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_I444TORGB24ROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
I444ToRGB24Row = I444ToRGB24Row_RVV;
}
#endif
#if defined(HAS_SCALEROWUP2_LINEAR_SSE2) #if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) { if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2;

View File

@ -65,9 +65,20 @@ extern "C" {
v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \ v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \
} }
// Read [VLEN/8] Y, [VLEN/8] U, and [VLEN/8] V from 444
#define READYUV444(vl, v_u, v_v, v_y_16) \
{ \
vuint8m1_t v_y; \
vl = __riscv_vsetvl_e8m1(w); \
v_y = __riscv_vle8_v_u8m1(src_y, vl); \
v_u = __riscv_vle8_v_u8m1(src_u, vl); \
v_v = __riscv_vle8_v_u8m1(src_v, vl); \
v_y_16 = __riscv_vwaddu_vx_u16m2(v_y, 0, vl); \
}
// Convert from YUV to fixed point RGB // Convert from YUV to fixed point RGB
#define YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, \ #define YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, \
v_g_16, v_b_16, v_r_16) \ v_y_16, v_g_16, v_b_16, v_r_16) \
{ \ { \
vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \ vuint16m2_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4; \
vuint32m4_t v_tmp5; \ vuint32m4_t v_tmp5; \
@ -252,6 +263,65 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
} while (w > 0); } while (w > 0);
} }
void I444ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
size_t vl;
size_t w = (size_t)width;
vuint8m1_t v_u, v_v;
vuint8m1_t v_ub, v_vr, v_ug, v_vg;
vuint8m1_t v_b, v_g, v_r, v_a;
vuint16m2_t v_yg, v_bb, v_bg, v_br;
vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
v_br);
v_a = __riscv_vmv_v_x_u8m1(255u, vl);
do {
READYUV444(vl, v_u, v_v, v_y_16);
YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl;
src_y += vl;
src_u += vl;
src_v += vl;
dst_argb += vl * 4;
} while (w > 0);
}
void I444ToRGB24Row_RVV(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
size_t vl;
size_t w = (size_t)width;
vuint8m1_t v_u, v_v;
vuint8m1_t v_ub, v_vr, v_ug, v_vg;
vuint8m1_t v_b, v_g, v_r;
vuint16m2_t v_yg, v_bb, v_bg, v_br;
vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16;
YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg,
v_br);
do {
READYUV444(vl, v_u, v_v, v_y_16);
YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
w -= vl;
src_y += vl;
src_u += vl;
src_v += vl;
dst_rgb24 += vl * 3;
} while (w > 0);
}
void I422ToARGBRow_RVV(const uint8_t* src_y, void I422ToARGBRow_RVV(const uint8_t* src_y,
const uint8_t* src_u, const uint8_t* src_u,
const uint8_t* src_v, const uint8_t* src_v,
@ -270,8 +340,8 @@ void I422ToARGBRow_RVV(const uint8_t* src_y,
v_a = __riscv_vmv_v_x_u8m1(255u, vl); v_a = __riscv_vmv_v_x_u8m1(255u, vl);
do { do {
READYUV422(vl, v_u, v_v, v_y_16); READYUV422(vl, v_u, v_v, v_y_16);
YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16, YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
v_b_16, v_r_16); v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
w -= vl; w -= vl;
@ -300,8 +370,8 @@ void I422ToRGBARow_RVV(const uint8_t* src_y,
v_a = __riscv_vmv_v_x_u8m1(255u, vl); v_a = __riscv_vmv_v_x_u8m1(255u, vl);
do { do {
READYUV422(vl, v_u, v_v, v_y_16); READYUV422(vl, v_u, v_v, v_y_16);
YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16, YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
v_b_16, v_r_16); v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl); __riscv_vsseg4e8_v_u8m1(dst_rgba, v_a, v_b, v_g, v_r, vl);
w -= vl; w -= vl;
@ -329,8 +399,8 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y,
v_br); v_br);
do { do {
READYUV422(vl, v_u, v_v, v_y_16); READYUV422(vl, v_u, v_v, v_y_16);
YUVTORGB(vl, v_u, v_v, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, v_y_16, v_g_16, YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br,
v_b_16, v_r_16); v_y_16, v_g_16, v_b_16, v_r_16);
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
__riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl); __riscv_vsseg3e8_v_u8m1(dst_rgb24, v_b, v_g, v_r, vl);
w -= vl; w -= vl;