mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[RVV] Optimize ScaleARGBFilterCols with RVV
* Run on SiFive internal FPGA: Test Case Speedup ARGBScaleDownBy3by8_Linear x2.05 ARGBScaleDownBy3by8_Bilinear x1.76 ARGBScaleDownBy3by8_Box x1.76 Bug: 42280924 Co-Developed-by: Bruce Lai <bruce.lai@sifive.com> Change-Id: Ib9979b1f2ca92d2ef5aa373f9b2459c246ded6c8 Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5103572 Reviewed-by: Wan-Teh Chang <wtc@google.com> Reviewed-by: Bruce Lai <bruce.lai@sifive.com> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
cce8950816
commit
b5a18f9d93
@ -166,6 +166,9 @@ extern "C" {
|
||||
#define HAS_SCALEUVROWDOWN4_RVV
|
||||
#endif
|
||||
#define HAS_SCALEUVROWDOWNEVEN_RVV
|
||||
#if __riscv_v_intrinsic == 11000
|
||||
#define HAS_SCALEARGBFILTERCOLS_RVV
|
||||
#endif
|
||||
#define HAS_SCALEARGBROWDOWN2_RVV
|
||||
#define HAS_SCALEARGBROWDOWN2BOX_RVV
|
||||
#define HAS_SCALEARGBROWDOWN2LINEAR_RVV
|
||||
@ -938,6 +941,11 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
void ScaleARGBFilterCols_RVV(uint8_t* dst_argb,
|
||||
const uint8_t* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx);
|
||||
|
||||
// ARGB Row functions
|
||||
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
|
||||
|
||||
@ -434,6 +434,12 @@ static int ScaleARGBBilinearDown(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
|
||||
if (TestCpuFlag(kCpuHasRVV)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
|
||||
// Allocate a row of ARGB.
|
||||
{
|
||||
@ -571,6 +577,11 @@ static int ScaleARGBBilinearUp(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
|
||||
if (filtering && TestCpuFlag(kCpuHasRVV)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
@ -854,6 +865,11 @@ static int ScaleYUVToARGBBilinearUp(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBFILTERCOLS_RVV)
|
||||
if (filtering && TestCpuFlag(kCpuHasRVV)) {
|
||||
ScaleARGBFilterCols = ScaleARGBFilterCols_RVV;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEARGBCOLS_SSE2)
|
||||
if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
|
||||
ScaleARGBFilterCols = ScaleARGBCols_SSE2;
|
||||
|
||||
@ -28,6 +28,55 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEARGBFILTERCOLS_RVV
|
||||
void ScaleARGBFilterCols_RVV(uint8_t* dst_argb,
|
||||
const uint8_t* src_argb,
|
||||
int dst_width,
|
||||
int x,
|
||||
int dx) {
|
||||
assert(x >= 0);
|
||||
|
||||
size_t vl = __riscv_vsetvl_e32m4(dst_width);
|
||||
vuint32m4_t vx = __riscv_vmv_v_x_u32m4(x, vl);
|
||||
vx = __riscv_vmacc_vx_u32m4(vx, dx, __riscv_vid_v_u32m4(vl), vl);
|
||||
do {
|
||||
vuint32m4_t v0_argb, v1_argb;
|
||||
vuint32m4_t v_xf0_u32, v_xf1_u32;
|
||||
vuint8m4_t v0_argb_u8, v1_argb_u8, v_xf0_u8, v_xf1_u8;
|
||||
vuint16m8_t _v0_argb_u16, v_row_u16;
|
||||
// idx is x >> 16
|
||||
vuint32m4_t v_xi_bindex = __riscv_vsrl_vx_u32m4(vx, 14, vl);
|
||||
v_xi_bindex = __riscv_vand_vx_u32m4(v_xi_bindex, ~3u, vl);
|
||||
// Read Packed ARGB w/ byte index.
|
||||
__riscv_vluxseg2ei32_v_u32m4(&v0_argb, &v1_argb, (const uint32_t*)src_argb,
|
||||
v_xi_bindex, vl);
|
||||
// xf = (x >> 9) & 0x7f;
|
||||
v_xf0_u32 = __riscv_vsrl_vx_u32m4(vx, 9, vl);
|
||||
v_xf0_u32 = __riscv_vand_vx_u32m4(v_xf0_u32, 0x7f, vl);
|
||||
vx = __riscv_vadd_vx_u32m4(vx, vl * dx, vl);
|
||||
// duplicate v_xf0_u32[i] from {0,0,0,f[i]} to {f[i],f[i],f[i],f[i]}
|
||||
v_xf0_u32 = __riscv_vmul_vx_u32m4(v_xf0_u32, 0x01010101, vl);
|
||||
// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
|
||||
v_xf1_u32 = __riscv_vxor_vx_u32m4(v_xf0_u32, 0x7f7f7f7f, vl);
|
||||
|
||||
v0_argb_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v0_argb);
|
||||
v1_argb_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v1_argb);
|
||||
v_xf0_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v_xf0_u32);
|
||||
v_xf1_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v_xf1_u32);
|
||||
// ((a) * (0x7f ^ f) + (b)*f) >> 7
|
||||
_v0_argb_u16 = __riscv_vwmulu_vv_u16m8(v0_argb_u8, v_xf1_u8, 4 * vl);
|
||||
v_row_u16 =
|
||||
__riscv_vwmaccu_vv_u16m8(_v0_argb_u16, v1_argb_u8, v_xf0_u8, 4 * vl);
|
||||
|
||||
__riscv_vse8_v_u8m4(dst_argb, __riscv_vnsrl_wx_u8m4(v_row_u16, 7, 4 * vl),
|
||||
4 * vl);
|
||||
dst_width -= vl;
|
||||
dst_argb += 4 * vl;
|
||||
vl = __riscv_vsetvl_e32m4(dst_width);
|
||||
} while (dst_width > 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEADDROW_RVV
|
||||
void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
|
||||
size_t w = (size_t)src_width;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user