diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 7da122ed1..f5294f082 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -166,6 +166,9 @@ extern "C" { #define HAS_SCALEUVROWDOWN4_RVV #endif #define HAS_SCALEUVROWDOWNEVEN_RVV +#if __riscv_v_intrinsic == 11000 +#define HAS_SCALEARGBFILTERCOLS_RVV +#endif #define HAS_SCALEARGBROWDOWN2_RVV #define HAS_SCALEARGBROWDOWN2BOX_RVV #define HAS_SCALEARGBROWDOWN2LINEAR_RVV @@ -938,6 +941,11 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, int dst_width, int x, int dx); +void ScaleARGBFilterCols_RVV(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); // ARGB Row functions void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 9cfb17988..da99febb9 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -434,6 +434,12 @@ static int ScaleARGBBilinearDown(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_RVV; + } +#endif + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. { @@ -571,6 +577,11 @@ static int ScaleARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_RVV) + if (filtering && TestCpuFlag(kCpuHasRVV)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_RVV; + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -854,6 +865,11 @@ static int ScaleYUVToARGBBilinearUp(int src_width, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_RVV) + if (filtering && TestCpuFlag(kCpuHasRVV)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_RVV; + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc index 6ed58de2f..9fe2b2773 100644 --- a/source/scale_rvv.cc +++ b/source/scale_rvv.cc @@ -28,6 +28,55 @@ namespace libyuv { extern "C" { #endif +#ifdef HAS_SCALEARGBFILTERCOLS_RVV +void ScaleARGBFilterCols_RVV(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + assert(x >= 0); + + size_t vl = __riscv_vsetvl_e32m4(dst_width); + vuint32m4_t vx = __riscv_vmv_v_x_u32m4(x, vl); + vx = __riscv_vmacc_vx_u32m4(vx, dx, __riscv_vid_v_u32m4(vl), vl); + do { + vuint32m4_t v0_argb, v1_argb; + vuint32m4_t v_xf0_u32, v_xf1_u32; + vuint8m4_t v0_argb_u8, v1_argb_u8, v_xf0_u8, v_xf1_u8; + vuint16m8_t _v0_argb_u16, v_row_u16; + // idx is x >> 16 + vuint32m4_t v_xi_bindex = __riscv_vsrl_vx_u32m4(vx, 14, vl); + v_xi_bindex = __riscv_vand_vx_u32m4(v_xi_bindex, ~3u, vl); + // Read Packed ARGB w/ byte index. + __riscv_vluxseg2ei32_v_u32m4(&v0_argb, &v1_argb, (const uint32_t*)src_argb, + v_xi_bindex, vl); + // xf = (x >> 9) & 0x7f; + v_xf0_u32 = __riscv_vsrl_vx_u32m4(vx, 9, vl); + v_xf0_u32 = __riscv_vand_vx_u32m4(v_xf0_u32, 0x7f, vl); + vx = __riscv_vadd_vx_u32m4(vx, vl * dx, vl); + // duplicate v_xf0_u32[i] from {0,0,0,f[i]} to {f[i],f[i],f[i],f[i]} + v_xf0_u32 = __riscv_vmul_vx_u32m4(v_xf0_u32, 0x01010101, vl); + // TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. + v_xf1_u32 = __riscv_vxor_vx_u32m4(v_xf0_u32, 0x7f7f7f7f, vl); + + v0_argb_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v0_argb); + v1_argb_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v1_argb); + v_xf0_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v_xf0_u32); + v_xf1_u8 = __riscv_vreinterpret_v_u32m4_u8m4(v_xf1_u32); + // ((a) * (0x7f ^ f) + (b)*f) >> 7 + _v0_argb_u16 = __riscv_vwmulu_vv_u16m8(v0_argb_u8, v_xf1_u8, 4 * vl); + v_row_u16 = + __riscv_vwmaccu_vv_u16m8(_v0_argb_u16, v1_argb_u8, v_xf0_u8, 4 * vl); + + __riscv_vse8_v_u8m4(dst_argb, __riscv_vnsrl_wx_u8m4(v_row_u16, 7, 4 * vl), + 4 * vl); + dst_width -= vl; + dst_argb += 4 * vl; + vl = __riscv_vsetvl_e32m4(dst_width); + } while (dst_width > 0); +} +#endif + #ifdef HAS_SCALEADDROW_RVV void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { size_t w = (size_t)src_width;