[RVV] Optimize ScaleARGBFilterCols with RVV

* Run on SiFive internal FPGA:

Test Case	                Speedup
ARGBScaleDownBy3by8_Linear      x2.05
ARGBScaleDownBy3by8_Bilinear    x1.76
ARGBScaleDownBy3by8_Box         x1.76

Bug: 42280924
Co-Developed-by: Bruce Lai <bruce.lai@sifive.com>
Change-Id: Ib9979b1f2ca92d2ef5aa373f9b2459c246ded6c8
Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5103572
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Darren Hsieh 2023-09-15 02:20:41 -07:00 committed by Frank Barchard
parent cce8950816
commit b5a18f9d93
3 changed files with 73 additions and 0 deletions

View File

@ -166,6 +166,9 @@ extern "C" {
#define HAS_SCALEUVROWDOWN4_RVV
#endif
#define HAS_SCALEUVROWDOWNEVEN_RVV
#if __riscv_v_intrinsic == 11000
#define HAS_SCALEARGBFILTERCOLS_RVV
#endif
#define HAS_SCALEARGBROWDOWN2_RVV
#define HAS_SCALEARGBROWDOWN2BOX_RVV
#define HAS_SCALEARGBROWDOWN2LINEAR_RVV
@ -938,6 +941,11 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
int dst_width,
int x,
int dx);
void ScaleARGBFilterCols_RVV(uint8_t* dst_argb,
const uint8_t* src_argb,
int dst_width,
int x,
int dx);
// ARGB Row functions
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,

View File

@ -434,6 +434,12 @@ static int ScaleARGBBilinearDown(int src_width,
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
}
#endif
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
// Allocate a row of ARGB.
{
@ -571,6 +577,11 @@ static int ScaleARGBBilinearUp(int src_width,
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
if (filtering && TestCpuFlag(kCpuHasRVV)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@ -854,6 +865,11 @@ static int ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
if (filtering && TestCpuFlag(kCpuHasRVV)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
}
#endif
#if defined(HAS_SCALEARGBCOLS_SSE2)
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
ScaleARGBFilterCols = ScaleARGBCols_SSE2;

View File

@ -28,6 +28,55 @@ namespace libyuv {
extern "C" {
#endif
#ifdef HAS_SCALEARGBFILTERCOLS_RVV
void ScaleARGBFilterCols_RVV(uint8_t* dst_argb,
const uint8_t* src_argb,
int dst_width,
int x,
int dx) {
assert(x >= 0);
size_t vl = __riscv_vsetvl_e32m4(dst_width);
vuint32m4_t vx = __riscv_vmv_v_x_u32m4(x, vl);
vx = __riscv_vmacc_vx_u32m4(vx, dx, __riscv_vid_v_u32m4(vl), vl);
do {
vuint32m4_t v0_argb, v1_argb;
vuint32m4_t v_xf0_u32, v_xf1_u32;
vuint8m4_t v0_argb_u8, v1_argb_u8, v_xf0_u8, v_xf1_u8;
vuint16m8_t _v0_argb_u16, v_row_u16;
// idx is x >> 16
vuint32m4_t v_xi_bindex = __riscv_vsrl_vx_u32m4(vx, 14, vl);
v_xi_bindex = __riscv_vand_vx_u32m4(v_xi_bindex, ~3u, vl);
// Read Packed ARGB w/ byte index.
__riscv_vluxseg2ei32_v_u32m4(&v0_argb, &v1_argb, (const uint32_t*)src_argb,
v_xi_bindex, vl);
// xf = (x >> 9) & 0x7f;
v_xf0_u32 = __riscv_vsrl_vx_u32m4(vx, 9, vl);
v_xf0_u32 = __riscv_vand_vx_u32m4(v_xf0_u32, 0x7f, vl);
vx = __riscv_vadd_vx_u32m4(vx, vl * dx, vl);
// duplicate v_xf0_u32[i] from {0,0,0,f[i]} to {f[i],f[i],f[i],f[i]}
v_xf0_u32 = __riscv_vmul_vx_u32m4(v_xf0_u32, 0x01010101, vl);
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
v_xf1_u32 = __riscv_vxor_vx_u32m4(v_xf0_u32, 0x7f7f7f7f, vl);
v0_argb_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v0_argb);
v1_argb_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v1_argb);
v_xf0_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v_xf0_u32);
v_xf1_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v_xf1_u32);
// ((a) * (0x7f ^ f) + (b)*f) >> 7
_v0_argb_u16 = __riscv_vwmulu_vv_u16m8(v0_argb_u8, v_xf1_u8, 4 * vl);
v_row_u16 =
__riscv_vwmaccu_vv_u16m8(_v0_argb_u16, v1_argb_u8, v_xf0_u8, 4 * vl);
__riscv_vse8_v_u8m4(dst_argb, __riscv_vnsrl_wx_u8m4(v_row_u16, 7, 4 * vl),
4 * vl);
dst_width -= vl;
dst_argb += 4 * vl;
vl = __riscv_vsetvl_e32m4(dst_width);
} while (dst_width > 0);
}
#endif
#ifdef HAS_SCALEADDROW_RVV
void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
size_t w = (size_t)src_width;