Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-07 01:06:46 +08:00)
[RVV] Enable ScaleRowDown34_RVV & ScaleRowDown34_{0,1}_Box_RVV
Run on SiFive internal FPGA:

Test case                     RVV function                Speedup
I444ScaleDownBy3by4_None      ScaleRowDown34_RVV          5.8
I444ScaleDownBy3by4_Linear    ScaleRowDown34_0/1_Box_RVV  6.5
I444ScaleDownBy3by4_Bilinear  ScaleRowDown34_0/1_Box_RVV  6.3

Bug: libyuv:956
Change-Id: I8ef221ab14d631e14f1ba1aaa25d2b30d4e710db
Signed-off-by: Darren Hsieh <darren.hsieh@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4607777
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
parent 2a5d7e2fbc
commit 552571e8b2
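Context for the hunks below: libyuv's 3/4 downscale maps every four source pixels to three destination pixels. In the unfiltered path this is pure decimation — pixels 0, 1 and 3 of each 4-pixel group are kept, which is exactly what the vsseg3 store of v_s0, v_s1, v_s3 in ScaleRowDown34_RVV does. A minimal scalar sketch of that mapping (illustrative only; the helper name is mine, not libyuv's C fallback):

#include <stdint.h>

// Scalar sketch of the unfiltered 3/4 row downscale: keep pixels 0, 1 and 3
// of every 4-pixel group. Hypothetical helper, for illustration only.
static void ScaleRowDown34_Sketch(const uint8_t* src, uint8_t* dst,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {  // dst_width is a multiple of 3
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[3];  // pixel 2 of the group is dropped
    src += 4;
    dst += 3;
  }
}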
include/libyuv/scale_row.h
@@ -180,6 +180,7 @@ extern "C" {
 #define HAS_SCALEARGBROWDOWN2_RVV
 #define HAS_SCALEARGBROWDOWNEVEN_RVV
 #define HAS_SCALEROWDOWN2_RVV
+#define HAS_SCALEROWDOWN34_RVV
 #define HAS_SCALEROWDOWN4_RVV
 #define HAS_SCALEUVROWDOWN2_RVV
 #define HAS_SCALEUVROWDOWN2LINEAR_RVV
@@ -1822,6 +1823,18 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width);
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
 
 #ifdef __cplusplus
 }  // extern "C"
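These declarations follow libyuv's row-function signature: a source row pointer, the stride to a second source row, a destination row pointer, and the destination width in pixels. A hypothetical single-row call — buffer names and sizes are mine; the unfiltered kernel never reads a second row, so a zero stride is passed, matching (as I read the caller) the zero filter_stride ScalePlaneDown34 uses when not filtering:

#include <stdint.h>

// Hypothetical invocation; sizes chosen so dst_width % 3 == 0 and the
// source row holds dst_width * 4 / 3 pixels.
void ExampleRow(void) {
  uint8_t src_row[64] = {0};  // 64 source pixels
  uint8_t dst_row[48];        // 48 = 64 * 3 / 4 destination pixels
  ScaleRowDown34_RVV(src_row, /*src_stride=*/0, dst_row, /*dst_width=*/48);
}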
source/scale.cc
@@ -485,6 +485,17 @@ static void ScalePlaneDown34(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN34_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_RVV;
+      ScaleRowDown34_1 = ScaleRowDown34_RVV;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_RVV;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_RVV;
+    }
+  }
+#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
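The filtered assignments above pair ScaleRowDown34_0_Box (3:1 vertical weighting of its two input rows) with ScaleRowDown34_1_Box (1:1 vertical averaging), so that three output rows interpolate across four input rows. Horizontally, both kernels then apply the same taps spelled out in the comments of the implementation below. A scalar model of those taps (helper names are mine, for illustration):

#include <stdint.h>

// Scalar model of the rounded filter taps used by the box kernels.
static uint8_t Blend31(uint8_t a, uint8_t b) {  // (3 * a + b + 2) >> 2
  return (uint8_t)((3 * a + b + 2) >> 2);
}
static uint8_t Blend11(uint8_t a, uint8_t b) {  // (a + b + 1) >> 1
  return (uint8_t)((a + b + 1) >> 1);
}
// Per 4 vertically blended pixels u0..u3, the kernels emit 3 outputs:
//   a0 = Blend31(u0, u1);  a1 = Blend11(u1, u2);  a2 = Blend31(u3, u2);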
source/scale_rvv.cc
@@ -330,6 +330,142 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
   } while (w > 0);
 }
 
+void ScaleRowDown34_RVV(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  do {
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
+    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl);
+    w -= vl;
+    src_ptr += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+
+void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding
+  // mode register) is set to round-to-nearest-up mode (0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
+    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
+    vuint16m4_t v_u1_u16;
+    vuint8m2_t v_a0, v_a1, v_a2;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+    if (src_stride == 0) {
+      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
+      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
+      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl);
+      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl);
+    } else {
+      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl);
+      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl);
+      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl);
+      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl);
+      t += 4 * vl;
+    }
+
+    v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl);
+    v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl);
+    v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl);
+    v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl);
+
+    // Use round-to-nearest-up mode for vnclip & averaging add
+    v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl);
+    v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl);
+    v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl);
+    v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl);
+
+    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl);
+    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+    v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl);
+
+    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl);
+    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+    w -= vl;
+    s += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+
+void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  size_t w = (size_t)dst_width / 3u;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding
+  // mode register) is set to round-to-nearest-up mode (0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
+    vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
+    vuint16m4_t v_u1_u16;
+    vuint8m2_t v_a0, v_a1, v_a2;
+    size_t vl = __riscv_vsetvl_e8m2(w);
+    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);
+
+    // Use round-to-nearest-up mode for vnclip & averaging add
+    if (src_stride == 0) {
+      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl);
+      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl);
+      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl);
+      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl);
+    } else {
+      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
+      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
+      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl);
+      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl);
+      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl);
+      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl);
+      t += 4 * vl;
+    }
+    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl);
+    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
+    v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl);
+
+    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
+    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl);
+    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl);
+    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);
+
+    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);
+
+    w -= vl;
+    s += 4 * vl;
+    dst_ptr += 3 * vl;
+  } while (w > 0);
+}
+
 void ScaleUVRowDown2_RVV(const uint8_t* src_uv,
                          ptrdiff_t src_stride,
                          uint8_t* dst_uv,
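All three kernels share the same vector-length-agnostic strip-mining shape: __riscv_vsetvl_e8m2 returns how many 4-pixel groups fit in the vector registers this iteration, vlseg4e8 deinterleaves each group into four registers, vsseg3e8 re-interleaves three results, and the pointers advance by 4 * vl and 3 * vl bytes. A stripped-down sketch of that loop shape (it only keeps byte 0 of each group; illustrative, not a libyuv function, and it assumes a toolchain accepting the same non-tuple segment-load intrinsic form used above):

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Strip-mining sketch: n 4-byte groups in, n bytes out (byte 0 per group).
void CopyEvery4th(const uint8_t* src, uint8_t* dst, size_t n) {
  do {
    size_t vl = __riscv_vsetvl_e8m2(n);  // groups processed this pass
    vuint8m2_t v0, v1, v2, v3;           // v1..v3 are loaded but unused here
    __riscv_vlseg4e8_v_u8m2(&v0, &v1, &v2, &v3, src, vl);  // deinterleave
    __riscv_vse8_v_u8m2(dst, v0, vl);    // contiguous store of byte-0 lanes
    src += 4 * vl;
    dst += vl;
    n -= vl;
  } while (n > 0);
}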