mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[RVV] Enable ScaleRowDown38_RVV & ScaleRowDown38_{2,3}_Box_RVV
* Measured on a SiFive internal FPGA (speedup vs. the scalar C path):
    I420ScaleDownBy3by8_None 4.2x, _Linear 1.7x, _Bilinear 1.7x, _Box 1.7x
    I444ScaleDownBy3by8_None 4.2x, _Linear 1.8x, _Bilinear 1.8x, _Box 1.8x
Change-Id: Ic2e98de2494d9e7b25f5db115a7f21c618eaefed
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4711857
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
10de943a12
commit
c60ac4025c
@ -181,6 +181,7 @@ extern "C" {
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_RVV
|
||||
#define HAS_SCALEROWDOWN2_RVV
|
||||
#define HAS_SCALEROWDOWN34_RVV
|
||||
#define HAS_SCALEROWDOWN38_RVV
|
||||
#define HAS_SCALEROWDOWN4_RVV
|
||||
#define HAS_SCALEROWUP2_LINEAR_RVV
|
||||
#define HAS_SCALEROWUP2_BILINEAR_RVV
|
||||
@ -1847,6 +1848,19 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleRowDown38_RVV(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width);
|
||||
void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width);
|
||||
void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width);
|
||||
|
||||
void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width);
|
||||
|
||||
@ -711,6 +711,17 @@ static void ScalePlaneDown38(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEROWDOWN38_RVV)
|
||||
if (TestCpuFlag(kCpuHasRVV)) {
|
||||
if (!filtering) {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_RVV;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_RVV;
|
||||
} else {
|
||||
ScaleRowDown38_3 = ScaleRowDown38_3_Box_RVV;
|
||||
ScaleRowDown38_2 = ScaleRowDown38_2_Box_RVV;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < dst_height - 2; y += 3) {
|
||||
ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
|
||||
|
||||
@ -21,8 +21,8 @@
|
||||
// This module is for clang RVV only. GCC does not yet support the segment
// load & store intrinsics.
|
||||
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
|
||||
defined(__clang__)
|
||||
#include <assert.h>
|
||||
#include <riscv_vector.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
@ -463,6 +463,152 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
|
||||
} while (w > 0);
|
||||
}
|
||||
|
||||
// Scales a row down by 3/8 horizontally without filtering: from every group
// of 8 source pixels, picks pixels 0, 3 and 6 and writes them out as 3
// destination pixels (point sampling).
// src_ptr:    source row.
// src_stride: unused (single-row kernel, kept for the shared row-function
//             signature).
// dst_ptr:    destination row; receives dst_width pixels.
// dst_width:  number of destination pixels; must be a positive multiple of 3.
void ScaleRowDown38_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  (void)src_stride;
  // Match the sibling ScaleRowDown38_{2,3}_Box_RVV asserts: also reject
  // non-positive widths so the do/while loop never runs a degenerate
  // zero-length first iteration.
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // Segment load deinterleaves 8 consecutive source bytes into 8 vectors;
    // only segments 0, 3 and 6 are kept for the 3/8 downscale.
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    // Segment store re-interleaves the three selected pixels per group.
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
|
||||
|
||||
// Scales two source rows down by 3/8 with a box filter.  Each group of 8
// source columns yields 3 destination pixels:
//   dst[0] ("e") = average of the 3x2 box at columns 0..2,
//   dst[1] ("f") = average of the 3x2 box at columns 3..5,
//   dst[2] ("g") = average of the 2x2 box at columns 6..7.
// src_stride selects the second row.  dst_width must be a positive multiple
// of 3.
void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  // Fixed-point reciprocals of the box populations: (sum * (65536/6)) >> 16
  // divides by 6, (sum * (65536/4)) >> 16 divides by 4.
  const uint16_t coeff_a = (65536u / 6u);
  const uint16_t coeff_b = (65536u / 4u);
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t r0_c0, r0_c1, r0_c2, r0_c3, r0_c4, r0_c5, r0_c6, r0_c7;
    vuint8m1_t r1_c0, r1_c1, r1_c2, r1_c3, r1_c4, r1_c5, r1_c6, r1_c7;
    vuint16m2_t pair0, pair1, pair2;
    vuint16m2_t sum_e, sum_f, sum_g;
    vuint8m1_t out_e, out_f, out_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // Deinterleave 8 consecutive columns from each of the two rows.
    // row 0: e00, e10, e20, f00, f10, f20, g00, g10
    // row 1: e01, e11, e21, f01, f11, f21, g01, g11
    __riscv_vlseg8e8_v_u8m1(&r0_c0, &r0_c1, &r0_c2, &r0_c3, &r0_c4, &r0_c5,
                            &r0_c6, &r0_c7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&r1_c0, &r1_c1, &r1_c2, &r1_c3, &r1_c4, &r1_c5,
                            &r1_c6, &r1_c7, src_ptr + src_stride, vl);
    // Widening column sums, then the total for box "e" (6 samples, max 1530
    // so u16 cannot overflow).
    pair0 = __riscv_vwaddu_vv_u16m2(r0_c0, r1_c0, vl);
    pair1 = __riscv_vwaddu_vv_u16m2(r0_c1, r1_c1, vl);
    pair2 = __riscv_vwaddu_vv_u16m2(r0_c2, r1_c2, vl);
    sum_e = __riscv_vadd_vv_u16m2(__riscv_vadd_vv_u16m2(pair0, pair1, vl),
                                  pair2, vl);
    // Total for box "f" (6 samples).
    pair0 = __riscv_vwaddu_vv_u16m2(r0_c3, r1_c3, vl);
    pair1 = __riscv_vwaddu_vv_u16m2(r0_c4, r1_c4, vl);
    pair2 = __riscv_vwaddu_vv_u16m2(r0_c5, r1_c5, vl);
    sum_f = __riscv_vadd_vv_u16m2(__riscv_vadd_vv_u16m2(pair0, pair1, vl),
                                  pair2, vl);
    // Total for box "g" (4 samples).
    pair0 = __riscv_vwaddu_vv_u16m2(r0_c6, r1_c6, vl);
    pair1 = __riscv_vwaddu_vv_u16m2(r0_c7, r1_c7, vl);
    sum_g = __riscv_vadd_vv_u16m2(pair0, pair1, vl);
    // Divide by the box population via the high half of a 16x16 multiply,
    // then narrow back to 8 bits (low byte of the high product).
    sum_e = __riscv_vmulhu_vx_u16m2(sum_e, coeff_a, vl);
    sum_f = __riscv_vmulhu_vx_u16m2(sum_f, coeff_a, vl);
    sum_g = __riscv_vmulhu_vx_u16m2(sum_g, coeff_b, vl);
    out_e = __riscv_vnsrl_wx_u8m1(sum_e, 0, vl);
    out_f = __riscv_vnsrl_wx_u8m1(sum_f, 0, vl);
    out_g = __riscv_vnsrl_wx_u8m1(sum_g, 0, vl);
    // Re-interleave the three averages per destination pixel group.
    __riscv_vsseg3e8_v_u8m1(dst_ptr, out_e, out_f, out_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
|
||||
|
||||
// Scales three source rows down by 3/8 with a box filter.  Each group of 8
// source columns yields 3 destination pixels:
//   dst[0] ("e") = average of the 3x3 box at columns 0..2 (9 samples),
//   dst[1] ("f") = average of the 3x3 box at columns 3..5 (9 samples),
//   dst[2] ("g") = average of the 2x3 box at columns 6..7 (6 samples).
// src_stride selects the second and third rows.  dst_width must be a
// positive multiple of 3.
void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  // Fixed-point reciprocals of the box populations:
  // (sum * (65536/9)) >> 16 divides by 9, (sum * (65536/6)) >> 16 by 6.
  const uint16_t coeff_a = (65536u / 9u);
  const uint16_t coeff_b = (65536u / 6u);
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
    vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
    vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
    vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
    vuint16m2_t v_g0, v_g1, v_g2, v_g;
    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // Deinterleave 8 consecutive columns from each of the three rows.
    // s: e00, e10, e20, f00, f10, f20, g00, g10
    // t: e01, e11, e21, f01, f11, f21, g01, g11
    // u: e02, e12, e22, f02, f12, f22, g02, g12
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
                            &v_t7, src_ptr + src_stride, vl);
    __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
                            &v_u7, src_ptr + 2 * src_stride, vl);
    // Calculate sum of [e00, e22]: widening pair sums (v_e4 widens the lone
    // remaining sample by adding 0), then a u16 reduction tree.  Max sum is
    // 9*255 = 2295, so u16 cannot overflow.
    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
    v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
    v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);

    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
    v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
    // Calculate sum of [f00, f22] the same way.
    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
    v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
    v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);

    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
    v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
    // Calculate sum of [g00, g12] (6 samples).
    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
    v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);

    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);

    // Average in 16-bit fixed-point: take the high half of a 16x16 multiply
    // by the reciprocal, then narrow to 8 bits.
    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);

    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
    // Re-interleave the three averages per destination pixel group.
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
|
||||
|
||||
// The ScaleUVRowUp2_(Bi)linear_RVV functions are equivalent to other
// platforms' ScaleRowUp2_(Bi)linear_Any_XXX: here the entire row is processed
// in one function, whereas other platforms implement only the non-edge part
// of the image and handle the edges with scalar code.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user