[RVV] Enable ScaleRowDown38_RVV & ScaleRowDown38_{2,3}_Box_RVV

* Run on SiFive internal FPGA:

Test Case                       Speedup
I420ScaleDownBy3by8_None        4.2
I420ScaleDownBy3by8_Linear      1.7
I420ScaleDownBy3by8_Bilinear    1.7
I420ScaleDownBy3by8_Box         1.7
I444ScaleDownBy3by8_None        4.2
I444ScaleDownBy3by8_Linear      1.8
I444ScaleDownBy3by8_Bilinear    1.8
I444ScaleDownBy3by8_Box         1.8

Change-Id: Ic2e98de2494d9e7b25f5db115a7f21c618eaefed
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4711857
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>

include/libyuv/scale_row.h

@@ -181,6 +181,7 @@ extern "C" {
#define HAS_SCALEARGBROWDOWNEVEN_RVV
#define HAS_SCALEROWDOWN2_RVV
#define HAS_SCALEROWDOWN34_RVV
#define HAS_SCALEROWDOWN38_RVV
#define HAS_SCALEROWDOWN4_RVV
#define HAS_SCALEROWUP2_LINEAR_RVV
#define HAS_SCALEROWUP2_BILINEAR_RVV
@@ -1847,6 +1848,19 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width);
void ScaleRowDown38_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width);
void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width);
void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width);
void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int dst_width);

source/scale.cc

@@ -711,6 +711,17 @@ static void ScalePlaneDown38(int src_width,
    }
  }
#endif
#if defined(HAS_SCALEROWDOWN38_RVV)
  if (TestCpuFlag(kCpuHasRVV)) {
    if (!filtering) {
      ScaleRowDown38_3 = ScaleRowDown38_RVV;
      ScaleRowDown38_2 = ScaleRowDown38_RVV;
    } else {
      ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV;
      ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV;
    }
  }
#endif

  for (y = 0; y < dst_height - 2; y += 3) {
    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);

source/scale_rvv.cc

@@ -21,8 +21,8 @@
// This module is for clang rvv. GCC does not yet support segment load & store.
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
    defined(__clang__)
#include <assert.h>
#include <riscv_vector.h>
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -463,6 +463,152 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
  } while (w > 0);
}

void ScaleRowDown38_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  (void)src_stride;
  assert(dst_width % 3 == 0);
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    size_t vl = __riscv_vsetvl_e8m1(w);
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
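    // From every group of 8 source pixels, keep pixels 0, 3 and 6: 3 output
    // pixels per 8 input pixels (3/8 scale, no filtering).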
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}

void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint16_t coeff_a = (65536u / 6u);
  const uint16_t coeff_b = (65536u / 4u);
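  // Q16 fixed-point reciprocals: vmulhu by (65536 / n) keeps the high 16 bits
  // of the product, which approximates sum / n. Here e and f are sums of 6
  // samples (3 columns x 2 rows) and g is a sum of 4 (2 columns x 2 rows).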
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
    vuint16m2_t v_e0, v_e1, v_e2, v_e;
    vuint16m2_t v_f0, v_f1, v_f2, v_f;
    vuint16m2_t v_g0, v_g1, v_g;
    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // s: e00, e10, e20, f00, f10, f20, g00, g10
    // t: e01, e11, e21, f01, f11, f21, g01, g11
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
                            &v_t7, src_ptr + src_stride, vl);
    // Calculate sum of [e00, e21] to v_e
    // Calculate sum of [f00, f21] to v_f
    // Calculate sum of [g00, g11] to v_g
    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
    // Average in 16-bit fixed-point
    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
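    // All averages fit in 8 bits, so vnsrl with shift 0 simply narrows each
    // 16-bit lane to 8 bits.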
    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}

void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint16_t coeff_a = (65536u / 9u);
  const uint16_t coeff_b = (65536u / 6u);
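  // Q16 fixed-point reciprocals, as in ScaleRowDown38_2_Box_RVV: e and f are
  // sums of 9 samples (3 columns x 3 rows), g is a sum of 6 (2 columns x 3
  // rows).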
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
    vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
    vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
    vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
    vuint16m2_t v_g0, v_g1, v_g2, v_g;
    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // s: e00, e10, e20, f00, f10, f20, g00, g10
    // t: e01, e11, e21, f01, f11, f21, g01, g11
    // u: e02, e12, e22, f02, f12, f22, g02, g12
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
                            &v_t7, src_ptr + src_stride, vl);
    __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
                            &v_u7, src_ptr + 2 * src_stride, vl);
    // Calculate sum of [e00, e22]
    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
    v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
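    // vwaddu_vx with 0 zero-extends the remaining third-row pixel from 8 to
    // 16 bits so it can join the 16-bit accumulation.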
    v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);
    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
    v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
    // Calculate sum of [f00, f22]
    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
    v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
    v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
    v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
    // Calculate sum of [g00, g12]
    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
    v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);
    // Average in 16-bit fixed-point
    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);
    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}

// ScaleRowUp2_(Bi)linear_RVV is equivalent to other platforms'
// ScaleRowUp2_(Bi)linear_Any_XXX: it processes the entire row, whereas other
// platforms implement only the non-edge part of the image and handle the
// edges with scalar code.