mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
[RVV] Enable ARGBBlendRow_RVV/BlendPlaneRow_RVV
* Run on SiFive internal FPGA: Test case Speedup ARGBBlend_Opt 4.60 BlendPlane_Opt 5.96 I420Blend_Opt 5.83 - Also, add code to use ScaleRowDown2Box_RVV in I420Blend Change-Id: Icc75e05d26b3427a98269d2a33c4474074033264 Signed-off-by: Bruce Lai <bruce.lai@sifive.com> Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4681100 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Wan-Teh Chang <wtc@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
b76fcd4654
commit
d33edd2373
@ -794,6 +794,7 @@ extern "C" {
|
|||||||
#define HAS_AB64TOARGBROW_RVV
|
#define HAS_AB64TOARGBROW_RVV
|
||||||
#define HAS_AR64TOARGBROW_RVV
|
#define HAS_AR64TOARGBROW_RVV
|
||||||
#define HAS_ARGBATTENUATEROW_RVV
|
#define HAS_ARGBATTENUATEROW_RVV
|
||||||
|
#define HAS_ARGBBLENDROW_RVV
|
||||||
#define HAS_ARGBCOPYYTOALPHAROW_RVV
|
#define HAS_ARGBCOPYYTOALPHAROW_RVV
|
||||||
#define HAS_ARGBEXTRACTALPHAROW_RVV
|
#define HAS_ARGBEXTRACTALPHAROW_RVV
|
||||||
#define HAS_ARGBTOAB64ROW_RVV
|
#define HAS_ARGBTOAB64ROW_RVV
|
||||||
@ -805,6 +806,7 @@ extern "C" {
|
|||||||
#define HAS_ABGRTOYROW_RVV
|
#define HAS_ABGRTOYROW_RVV
|
||||||
#define HAS_ABGRTOYJROW_RVV
|
#define HAS_ABGRTOYJROW_RVV
|
||||||
#define HAS_BGRATOYROW_RVV
|
#define HAS_BGRATOYROW_RVV
|
||||||
|
#define HAS_BLENDPLANEROW_RVV
|
||||||
#define HAS_COPYROW_RVV
|
#define HAS_COPYROW_RVV
|
||||||
#define HAS_I400TOARGBROW_RVV
|
#define HAS_I400TOARGBROW_RVV
|
||||||
#define HAS_I422ALPHATOARGBROW_RVV
|
#define HAS_I422ALPHATOARGBROW_RVV
|
||||||
@ -4541,6 +4543,10 @@ void ARGBBlendRow_LSX(const uint8_t* src_argb0,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width);
|
int width);
|
||||||
|
void ARGBBlendRow_RVV(const uint8_t* src_argb0,
|
||||||
|
const uint8_t* src_argb1,
|
||||||
|
uint8_t* dst_argb,
|
||||||
|
int width);
|
||||||
void ARGBBlendRow_C(const uint8_t* src_argb,
|
void ARGBBlendRow_C(const uint8_t* src_argb,
|
||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
@ -4567,6 +4573,11 @@ void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
|
|||||||
const uint8_t* v_buf,
|
const uint8_t* v_buf,
|
||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int width);
|
int width);
|
||||||
|
void BlendPlaneRow_RVV(const uint8_t* src0,
|
||||||
|
const uint8_t* src1,
|
||||||
|
const uint8_t* alpha,
|
||||||
|
uint8_t* dst,
|
||||||
|
int width);
|
||||||
void BlendPlaneRow_C(const uint8_t* src0,
|
void BlendPlaneRow_C(const uint8_t* src0,
|
||||||
const uint8_t* src1,
|
const uint8_t* src1,
|
||||||
const uint8_t* alpha,
|
const uint8_t* alpha,
|
||||||
|
|||||||
@ -2831,6 +2831,11 @@ int ARGBBlend(const uint8_t* src_argb0,
|
|||||||
if (TestCpuFlag(kCpuHasLSX)) {
|
if (TestCpuFlag(kCpuHasLSX)) {
|
||||||
ARGBBlendRow = ARGBBlendRow_LSX;
|
ARGBBlendRow = ARGBBlendRow_LSX;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_ARGBBLENDROW_RVV)
|
||||||
|
if (TestCpuFlag(kCpuHasRVV)) {
|
||||||
|
ARGBBlendRow = ARGBBlendRow_RVV;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
for (y = 0; y < height; ++y) {
|
for (y = 0; y < height; ++y) {
|
||||||
ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
|
ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
|
||||||
@ -2891,6 +2896,11 @@ int BlendPlane(const uint8_t* src_y0,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_BLENDPLANEROW_RVV)
|
||||||
|
if (TestCpuFlag(kCpuHasRVV)) {
|
||||||
|
BlendPlaneRow = BlendPlaneRow_RVV;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
for (y = 0; y < height; ++y) {
|
for (y = 0; y < height; ++y) {
|
||||||
BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
|
BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width);
|
||||||
@ -2967,6 +2977,11 @@ int I420Blend(const uint8_t* src_y0,
|
|||||||
BlendPlaneRow = BlendPlaneRow_AVX2;
|
BlendPlaneRow = BlendPlaneRow_AVX2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_BLENDPLANEROW_RVV)
|
||||||
|
if (TestCpuFlag(kCpuHasRVV)) {
|
||||||
|
BlendPlaneRow = BlendPlaneRow_RVV;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
if (!IS_ALIGNED(width, 2)) {
|
if (!IS_ALIGNED(width, 2)) {
|
||||||
ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
|
ScaleRowDown2 = ScaleRowDown2Box_Odd_C;
|
||||||
@ -3004,6 +3019,11 @@ int I420Blend(const uint8_t* src_y0,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_SCALEROWDOWN2_RVV)
|
||||||
|
if (TestCpuFlag(kCpuHasRVV)) {
|
||||||
|
ScaleRowDown2 = ScaleRowDown2Box_RVV;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// Row buffer for intermediate alpha pixels.
|
// Row buffer for intermediate alpha pixels.
|
||||||
align_buffer_64(halfalpha, halfwidth);
|
align_buffer_64(halfalpha, halfwidth);
|
||||||
|
|||||||
@ -1070,6 +1070,81 @@ void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
|||||||
RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants);
|
RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Blend src_argb over src_argb1 and store to dst_argb.
|
||||||
|
// dst_argb may be src_argb or src_argb1.
|
||||||
|
// src_argb: RGB values have already been pre-multiplied by the a.
|
||||||
|
void ARGBBlendRow_RVV(const uint8_t* src_argb,
|
||||||
|
const uint8_t* src_argb1,
|
||||||
|
uint8_t* dst_argb,
|
||||||
|
int width) {
|
||||||
|
size_t w = (size_t)width;
|
||||||
|
size_t vl = __riscv_vsetvlmax_e8m2();
|
||||||
|
// clamp255((((256 - a) * b) >> 8) + f)
|
||||||
|
// = b * (256 - a) / 256 + f
|
||||||
|
// = b - (b * a / 256) + f
|
||||||
|
vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl);
|
||||||
|
do {
|
||||||
|
vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a;
|
||||||
|
vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a;
|
||||||
|
vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r;
|
||||||
|
vuint8m2_t v_dst_b, v_dst_g, v_dst_r;
|
||||||
|
vl = __riscv_vsetvl_e8m2(w);
|
||||||
|
__riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a,
|
||||||
|
src_argb, vl);
|
||||||
|
__riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a,
|
||||||
|
src_argb1, vl);
|
||||||
|
|
||||||
|
v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl);
|
||||||
|
v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl);
|
||||||
|
v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl);
|
||||||
|
|
||||||
|
v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl);
|
||||||
|
v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl);
|
||||||
|
v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl);
|
||||||
|
|
||||||
|
v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl);
|
||||||
|
v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl);
|
||||||
|
v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl);
|
||||||
|
__riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl);
|
||||||
|
|
||||||
|
w -= vl;
|
||||||
|
src_argb += 4 * vl;
|
||||||
|
src_argb1 += 4 * vl;
|
||||||
|
dst_argb += 4 * vl;
|
||||||
|
} while (w > 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void BlendPlaneRow_RVV(const uint8_t* src0,
|
||||||
|
const uint8_t* src1,
|
||||||
|
const uint8_t* alpha,
|
||||||
|
uint8_t* dst,
|
||||||
|
int width) {
|
||||||
|
size_t w = (size_t)width;
|
||||||
|
do {
|
||||||
|
vuint16m8_t v_dst_u16;
|
||||||
|
vuint8m4_t v_dst;
|
||||||
|
size_t vl = __riscv_vsetvl_e8m4(w);
|
||||||
|
vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
|
||||||
|
vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
|
||||||
|
vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl);
|
||||||
|
vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl);
|
||||||
|
|
||||||
|
// (a * foreground) + (1-a) * background
|
||||||
|
v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl);
|
||||||
|
v_dst_u16 =
|
||||||
|
__riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl);
|
||||||
|
v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl);
|
||||||
|
v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl);
|
||||||
|
|
||||||
|
__riscv_vse8_v_u8m4(dst, v_dst, vl);
|
||||||
|
w -= vl;
|
||||||
|
src0 += vl;
|
||||||
|
src1 += vl;
|
||||||
|
alpha += vl;
|
||||||
|
dst += vl;
|
||||||
|
} while (w > 0);
|
||||||
|
}
|
||||||
|
|
||||||
// Attenuate: (f * a + 255) >> 8
|
// Attenuate: (f * a + 255) >> 8
|
||||||
void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
|
void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user