diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 2f9c792ef..33a304e53 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -841,8 +841,15 @@ extern "C" {
 #endif
 
 #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
+#if __riscv_v_intrinsic > 11000
+// Since v0.12, TUPLE_TYPE is introduced for segment load and store.
+#define LIBYUV_RVV_HAS_TUPLE_TYPE
+// Since v0.12, VXRM(fixed-point rounding mode) is included in arguments of
+// fixed-point intrinsics.
+#define LIBYUV_RVV_HAS_VXRM_ARG
+#endif
+
 #define HAS_COPYROW_RVV
-#if __riscv_v_intrinsic == 11000
 #define HAS_AB64TOARGBROW_RVV
 #define HAS_ABGRTOYJROW_RVV
 #define HAS_ABGRTOYROW_RVV
@@ -900,7 +907,6 @@ extern "C" {
 #define HAS_SPLITUVROW_RVV
 #define HAS_SPLITXRGBROW_RVV
 #endif
-#endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
 #if defined(VISUALC_HAS_AVX2)
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 02ed61ca7..cc3b8f6b5 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -184,7 +184,6 @@ extern "C" {
 // #define HAS_SCALEARGBROWDOWNEVEN_RVV
 #define HAS_SCALEUVROWDOWN4_RVV
 #define HAS_SCALEUVROWDOWNEVEN_RVV
-#if __riscv_v_intrinsic == 11000
 #define HAS_SCALEARGBROWDOWN2_RVV
 #define HAS_SCALEARGBROWDOWN2BOX_RVV
 #define HAS_SCALEARGBROWDOWN2LINEAR_RVV
@@ -208,7 +207,6 @@ extern "C" {
 #define HAS_SCALEUVROWUP2_BILINEAR_RVV
 #define HAS_SCALEUVROWUP2_LINEAR_RVV
 #endif
-#endif
 
 // Scale ARGB vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
diff --git a/riscv_script/riscv-clang.cmake b/riscv_script/riscv-clang.cmake
index e287941f7..35888aefa 100644
--- a/riscv_script/riscv-clang.cmake
+++ b/riscv_script/riscv-clang.cmake
@@ -43,6 +43,7 @@ if(RISCV_COMPILER_FLAGS STREQUAL "")
     list(APPEND RISCV_COMPILER_FLAGS "-march=rv64gc")
   endif()
 endif()
+add_compile_options("-Wuninitialized")
 message(STATUS "RISCV_COMPILER_FLAGS: ${RISCV_COMPILER_FLAGS}")
 
 set(CMAKE_C_FLAGS "${RISCV_COMPILER_FLAGS} ${CMAKE_C_FLAGS}")
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index 0bf2bef63..39a5c0dbc 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -28,6 +28,20 @@ namespace libyuv {
 extern "C" {
 #endif
 
+#ifdef LIBYUV_RVV_HAS_VXRM_ARG
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
+  {                                                              \
+    ub = yuvconst->kUVCoeff[0];                                  \
+    vr = yuvconst->kUVCoeff[1];                                  \
+    ug = yuvconst->kUVCoeff[2];                                  \
+    vg = yuvconst->kUVCoeff[3];                                  \
+    yg = yuvconst->kRGBCoeffBias[0];                             \
+    bb = yuvconst->kRGBCoeffBias[1] + 32;                        \
+    bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
+    br = yuvconst->kRGBCoeffBias[3] + 32;                        \
+  }
+#else
 // Fill YUV -> RGB conversion constants into vectors
 // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
 // register) is set to round-to-nearest-up mode(0).
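Illustration (not part of the patch): the two macros introduced above gate the only intrinsic-API differences this patch deals with. A minimal sketch, with a hypothetical helper name, of how the same fixed-point narrowing is spelled with and without the explicit vxrm argument; both spellings appear verbatim in the RGBTORGB8 hunks that follow.

#include <stddef.h>
#include <riscv_vector.h>

// Hypothetical helper: narrow Q6 fixed-point values to 8 bits with
// round-to-nearest-up, mirroring the RGBTORGB8 macro in this file.
static inline vuint8m2_t NarrowQ6ToU8(vuint16m4_t acc, size_t vl) {
#ifdef LIBYUV_RVV_HAS_VXRM_ARG
  // v0.12+ intrinsics take the rounding mode per call.
  return __riscv_vnclipu_wx_u8m2(acc, 6, __RISCV_VXRM_RNU, vl);
#else
  // Older intrinsics read the rounding mode from the vxrm CSR.
  return __riscv_vnclipu_wx_u8m2(acc, 6, vl);
#endif
}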
@@ -43,7 +57,7 @@ extern "C" { bg = yuvconst->kRGBCoeffBias[2] - 32; \ br = yuvconst->kRGBCoeffBias[3] + 32; \ } - +#endif // Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422 #define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \ { \ @@ -95,6 +109,15 @@ extern "C" { v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl); \ } +#ifdef LIBYUV_RVV_HAS_VXRM_ARG +// Convert from fixed point RGB To 8 bit RGB +#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ + { \ + v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, __RISCV_VXRM_RNU, vl); \ + v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, __RISCV_VXRM_RNU, vl); \ + v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, __RISCV_VXRM_RNU, vl); \ + } +#else // Convert from fixed point RGB To 8 bit RGB #define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \ { \ @@ -102,7 +125,53 @@ extern "C" { v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl); \ v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl); \ } +#endif +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv +#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ + { \ + vuint8m1x2_t v_tmp; \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_uv, vl); \ + v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0); \ + v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } + +// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu +#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16) \ + { \ + vuint8m1x2_t v_tmp; \ + vuint8m1_t v_tmp0, v_tmp1; \ + vuint8m2_t v_y; \ + vuint16m2_t v_u_16, v_v_16; \ + vl = __riscv_vsetvl_e8m1((w + 1) / 2); \ + v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_vu, vl); \ + v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0); \ + v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1); \ + v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl); \ + v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl); \ + v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl); \ + v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl); \ + v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16); \ + v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16); \ + vl = __riscv_vsetvl_e8m2(w); \ + v_y = __riscv_vle8_v_u8m2(src_y, vl); \ + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ + } +#else // Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv #define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16) \ { \ @@ -140,6 +209,7 @@ extern "C" { v_y = __riscv_vle8_v_u8m2(src_y, vl); \ v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); \ } +#endif #ifdef HAS_ARGBTOAR64ROW_RVV void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -160,6 +230,34 @@ void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { #endif #ifdef HAS_ARGBTOAB64ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + size_t avl = (size_t)width; + do { + vuint16m2x4_t v_dst_ab64; + vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16; + size_t vl = 
__riscv_vsetvl_e8m1(avl); + vuint8m1x4_t v_src_argb = __riscv_vlseg4e8_v_u8m1x4(src_argb, vl); + vuint8m1_t v_b = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 0); + vuint8m1_t v_g = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 1); + vuint8m1_t v_r = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 2); + vuint8m1_t v_a = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 3); + v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl); + v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl); + v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl); + v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl); + v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl); + v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl); + v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl); + v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl); + v_dst_ab64 = __riscv_vcreate_v_u16m2x4(v_r_16, v_g_16, v_b_16, v_a_16); + __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_ab64, vl); + avl -= vl; + src_argb += 4 * vl; + dst_ab64 += 4 * vl; + } while (avl > 0); +} +#else void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { size_t avl = (size_t)width; do { @@ -182,6 +280,7 @@ void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { } while (avl > 0); } #endif +#endif #ifdef HAS_AR64TOARGBROW_RVV void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { @@ -201,6 +300,26 @@ void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { #endif #ifdef HAS_AR64TOAB64ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void AR64ToAB64Row_RVV(const uint16_t* src_ar64, + uint16_t* dst_ab64, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e16m2(w); + vuint16m2x4_t v_argb16 = __riscv_vlseg4e16_v_u16m2x4(src_ar64, vl); + vuint16m2_t v_b = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 0); + vuint16m2_t v_g = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 1); + vuint16m2_t v_r = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 2); + vuint16m2_t v_a = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 3); + vuint16m2x4_t v_dst_abgr = __riscv_vcreate_v_u16m2x4(v_r, v_g, v_b, v_a); + __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_abgr, vl); + w -= vl; + src_ar64 += vl * 4; + dst_ab64 += vl * 4; + } while (w > 0); +} +#else void AR64ToAB64Row_RVV(const uint16_t* src_ar64, uint16_t* dst_ab64, int width) { @@ -216,8 +335,31 @@ void AR64ToAB64Row_RVV(const uint16_t* src_ar64, } while (w > 0); } #endif +#endif #ifdef HAS_AB64TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + size_t avl = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e16m2(avl); + vuint16m2x4_t v_abgr16 = __riscv_vlseg4e16_v_u16m2x4(src_ab64, vl); + vuint16m2_t v_r_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 0); + vuint16m2_t v_g_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 1); + vuint16m2_t v_b_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 2); + vuint16m2_t v_a_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 3); + vuint8m1_t v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl); + vuint8m1_t v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl); + vuint8m1_t v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl); + vuint8m1_t v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl); + vuint8m1x4_t v_dst_argb = __riscv_vcreate_v_u8m1x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m1x4(dst_argb, v_dst_argb, vl); + avl -= vl; + src_ab64 += 4 * vl; + dst_argb += 4 * vl; + } while (avl > 0); +} +#else void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { size_t avl = (size_t)width; do { @@ -236,8 +378,28 @@ void AB64ToARGBRow_RVV(const 
uint16_t* src_ab64, uint8_t* dst_argb, int width) { } while (avl > 0); } #endif +#endif #ifdef HAS_RAWTOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl); + vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1); + vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2); + vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_raw += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#else void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -253,8 +415,28 @@ void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_RAWTORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl); + vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1); + vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2); + vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r); + __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl); + w -= vl; + src_raw += vl * 3; + dst_rgba += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#else void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -270,8 +452,26 @@ void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_RAWTORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl); + vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1); + vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2); + vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); + __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); + w -= vl; + src_raw += vl * 3; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#else void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { size_t w = (size_t)width; do { @@ -285,8 +485,27 @@ void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_ARGBTORAWROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); + 
vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b); + __riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl); + w -= vl; + src_argb += vl * 4; + dst_raw += vl * 3; + } while (w > 0); +} +#else void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { size_t w = (size_t)width; do { @@ -300,8 +519,28 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_ARGBTORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBToRGB24Row_RVV(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); + vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); + __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); + w -= vl; + src_argb += vl * 4; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#else void ARGBToRGB24Row_RVV(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { @@ -317,8 +556,27 @@ void ARGBToRGB24Row_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_ARGBTOABGRROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); + vuint8m2x4_t v_dst_abgr = __riscv_vcreate_v_u8m2x4(v_r, v_g, v_b, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_abgr, v_dst_abgr, vl); + w -= vl; + src_argb += vl * 4; + dst_abgr += vl * 4; + } while (w > 0); +} +#else void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { size_t w = (size_t)width; do { @@ -332,8 +590,27 @@ void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_ARGBTOBGRAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); + vuint8m2x4_t v_dst_bgra = __riscv_vcreate_v_u8m2x4(v_a, v_r, v_g, v_b); + __riscv_vsseg4e8_v_u8m2x4(dst_bgra, v_dst_bgra, vl); + w -= vl; + src_argb += vl * 4; + dst_bgra += vl * 4; + } while (w > 0); +} +#else void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { size_t w = (size_t)width; do { @@ -347,8 +624,27 @@ void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_ARGBTORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = 
__riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); + vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r); + __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl); + w -= vl; + src_argb += vl * 4; + dst_rgba += vl * 4; + } while (w > 0); +} +#else void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { size_t w = (size_t)width; do { @@ -362,8 +658,27 @@ void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_RGBATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3); + vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_rgba += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#else void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { size_t w = (size_t)width; do { @@ -377,8 +692,30 @@ void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_RGB24TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2x3_t v_src_rgb = __riscv_vlseg3e8_v_u8m2x3(src_rgb24, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 2); + vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_rgb24 += vl * 3; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#else void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -396,8 +733,41 @@ void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24, } while (w > 0); } #endif +#endif #ifdef HAS_I444TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void I444ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + vuint8m2x4_t v_dst_argb; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, 
v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#else void I444ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -427,8 +797,43 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_I444ALPHATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + vuint8m2x4_t v_dst_argb; + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#else void I444AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -460,8 +865,40 @@ void I444AlphaToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_I444TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void I444ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + vuint8m2x3_t v_dst_rgb; + READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); + __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); + w -= vl; + src_y += vl; + src_u += vl; + src_v += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#else void I444ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -490,8 +927,41 @@ void I444ToRGB24Row_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_I422TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void I422ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + vuint8m2x4_t v_dst_argb; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); 
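+    // 4:2:2: one U/V pair covers two Y samples, so the chroma pointers below
+    // advance by only vl / 2 per iteration.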
+ v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#else void I422ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -521,8 +991,43 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_I422ALPHATOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + vuint8m2x4_t v_dst_argb; + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m2(src_a, vl); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} +#else void I422AlphaToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -554,8 +1059,41 @@ void I422AlphaToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_I422TORGBAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void I422ToRGBARow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + vuint8m2x4_t v_dst_rgba; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r); + __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgba += vl * 4; + } while (w > 0); +} +#else void I422ToRGBARow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -585,8 +1123,40 @@ void I422ToRGBARow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_I422TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void I422ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + vuint8m2x3_t v_dst_rgb; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, 
v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); + __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); + w -= vl; + src_y += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#else void I422ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -615,8 +1185,48 @@ void I422ToRGB24Row_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_I400TOARGBROW_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void I400ToARGBRow_RVV(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); + vuint8m2x4_t v_dst_argb; + vuint16m4_t v_yb; + if (is_yb_positive) { + v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); + } else { + v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl); + } + do { + vuint8m2_t v_y, v_out; + vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2; + vl = __riscv_vsetvl_e8m2(w); + v_y = __riscv_vle8_v_u8m2(src_y, vl); + v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl); + v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y + v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl); + if (is_yb_positive) { + v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl); + } else { + v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl); + } + v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, __RISCV_VXRM_RNU, vl); + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_out, v_out, v_out, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#else void I400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -656,8 +1266,25 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_J400TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_y = __riscv_vle8_v_u8m2(src_y, vl); + vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_y, v_y, v_y, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#else void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { size_t w = (size_t)width; size_t vl = __riscv_vsetvl_e8m2(w); @@ -673,6 +1300,7 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) { } while (w > 0); } #endif +#endif #ifdef HAS_COPYROW_RVV void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { @@ -689,6 +1317,36 @@ void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) { #endif #ifdef HAS_NV12TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void NV12ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, 
v_a; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + vuint8m2x4_t v_dst_argb; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#else void NV12ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, @@ -716,8 +1374,38 @@ void NV12ToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_NV12TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void NV12ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint8m2x3_t v_dst_rgb; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); + __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); + w -= vl; + src_y += vl; + src_uv += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#else void NV12ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -744,8 +1432,39 @@ void NV12ToRGB24Row_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_NV21TOARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void NV21ToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r, v_a; + vuint8m2x4_t v_dst_argb; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#else void NV21ToARGBRow_RVV(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, @@ -773,8 +1492,38 @@ void NV21ToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif #ifdef HAS_NV21TORGB24ROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void NV21ToRGB24Row_RVV(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + uint8_t ub, vr, ug, vg; + int16_t yg, bb, bg, br; + vuint8m2_t v_u, v_v; + vuint8m2_t v_b, v_g, v_r; + vuint8m2x3_t v_dst_rgb; + vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16; + 
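+  // NV21 stores chroma as V,U; READNV21 swaps the loaded pair so v_u and v_v
+  // keep their usual meaning.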
YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br); + do { + READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16); + YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, + v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r); + __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl); + w -= vl; + src_y += vl; + src_vu += vl; + dst_rgb24 += vl * 3; + } while (w > 0); +} +#else void NV21ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, @@ -801,10 +1550,65 @@ void NV21ToRGB24Row_RVV(const uint8_t* src_y, } while (w > 0); } #endif +#endif // Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1 - #ifdef HAS_INTERPOLATEROW_RVV +#ifdef LIBYUV_RVV_HAS_VXRM_ARG +void InterpolateRow_RVV(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y1_fraction = source_y_fraction; + int y0_fraction = 256 - y1_fraction; + const uint8_t* src_ptr1 = src_ptr + src_stride; + size_t dst_w = (size_t)dst_width; + assert(source_y_fraction >= 0); + assert(source_y_fraction < 256); + // Blend 100 / 0 - Copy row unchanged. + if (y1_fraction == 0) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl); + dst_w -= vl; + src_ptr += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // Blend 50 / 50. + if (y1_fraction == 128) { + do { + size_t vl = __riscv_vsetvl_e8m8(dst_w); + vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl); + vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl); + vuint8m8_t row_out = + __riscv_vaaddu_vv_u8m8(row0, row1, __RISCV_VXRM_RNU, vl); + __riscv_vse8_v_u8m8(dst_ptr, row_out, vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); + return; + } + // General purpose row blend. 
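+  // acc = row0 * y0_fraction + row1 * y1_fraction; the rounding narrow by 8
+  // below divides by 256 with round-to-nearest-up.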
+ do { + size_t vl = __riscv_vsetvl_e8m4(dst_w); + vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl); + vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl); + vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl); + acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl); + __riscv_vse8_v_u8m4( + dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, __RISCV_VXRM_RNU, vl), vl); + dst_w -= vl; + src_ptr += vl; + src_ptr1 += vl; + dst_ptr += vl; + } while (dst_w > 0); +} +#else void InterpolateRow_RVV(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, @@ -862,8 +1666,33 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, } while (dst_w > 0); } #endif +#endif #ifdef HAS_SPLITRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void SplitRGBRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x3_t v_src = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl); + vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src, 1); + vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src, 2); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_rgb += vl * 3; + } while (w > 0); +} +#else void SplitRGBRow_RVV(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, @@ -885,8 +1714,31 @@ void SplitRGBRow_RVV(const uint8_t* src_rgb, } while (w > 0); } #endif +#endif #ifdef HAS_MERGERGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void MergeRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2x3_t v_dst = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b); + __riscv_vsseg3e8_v_u8m2x3(dst_rgb, v_dst, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_rgb += vl * 3; + } while (w > 0); +} +#else void MergeRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -907,8 +1759,37 @@ void MergeRGBRow_RVV(const uint8_t* src_r, } while (w > 0); } #endif +#endif #ifdef HAS_SPLITARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void SplitARGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src, 2); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src, 3); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_a += vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#else void SplitARGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -933,8 +1814,34 @@ void SplitARGBRow_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_MERGEARGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void MergeARGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + 
const uint8_t* src_a, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl); + vuint8m2x4_t v_dst = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + src_a += vl; + dst_argb += vl * 4; + } while (w > 0); +} +#else void MergeARGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -958,8 +1865,33 @@ void MergeARGBRow_RVV(const uint8_t* src_r, } while (w > 0); } #endif +#endif #ifdef HAS_SPLITXRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void SplitXRGBRow_RVV(const uint8_t* src_argb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src, 2); + __riscv_vse8_v_u8m2(dst_r, v_r, vl); + __riscv_vse8_v_u8m2(dst_g, v_g, vl); + __riscv_vse8_v_u8m2(dst_b, v_b, vl); + w -= vl; + dst_r += vl; + dst_g += vl; + dst_b += vl; + src_argb += vl * 4; + } while (w > 0); +} +#else void SplitXRGBRow_RVV(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, @@ -981,8 +1913,33 @@ void SplitXRGBRow_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_MERGEXRGBROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void MergeXRGBRow_RVV(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl); + do { + vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl); + vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl); + vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl); + vuint8m2x4_t v_dst = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst, vl); + w -= vl; + src_r += vl; + src_g += vl; + src_b += vl; + dst_argb += vl * 4; + vl = __riscv_vsetvl_e8m2(w); + } while (w > 0); +} +#else void MergeXRGBRow_RVV(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, @@ -1006,8 +1963,29 @@ void MergeXRGBRow_RVV(const uint8_t* src_r, } while (w > 0); } #endif +#endif #ifdef HAS_SPLITUVROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void SplitUVRow_RVV(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4x2_t v_src = __riscv_vlseg2e8_v_u8m4x2(src_uv, vl); + vuint8m4_t v_u = __riscv_vget_v_u8m4x2_u8m4(v_src, 0); + vuint8m4_t v_v = __riscv_vget_v_u8m4x2_u8m4(v_src, 1); + __riscv_vse8_v_u8m4(dst_u, v_u, vl); + __riscv_vse8_v_u8m4(dst_v, v_v, vl); + w -= vl; + dst_u += vl; + dst_v += vl; + src_uv += 2 * vl; + } while (w > 0); +} +#else void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -1026,8 +2004,28 @@ void SplitUVRow_RVV(const uint8_t* src_uv, } while (w > 0); } #endif +#endif #ifdef HAS_MERGEUVROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void MergeUVRow_RVV(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + 
vuint8m4_t v_u = __riscv_vle8_v_u8m4(src_u, vl); + vuint8m4_t v_v = __riscv_vle8_v_u8m4(src_v, vl); + vuint8m4x2_t v_dst = __riscv_vcreate_v_u8m4x2(v_u, v_v); + __riscv_vsseg2e8_v_u8m4x2(dst_uv, v_dst, vl); + w -= vl; + src_u += vl; + src_v += vl; + dst_uv += 2 * vl; + } while (w > 0); +} +#else void MergeUVRow_RVV(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, @@ -1046,6 +2044,7 @@ void MergeUVRow_RVV(const uint8_t* src_u, } while (w > 0); } #endif +#endif struct RgbConstants { uint8_t kRGBToY[4]; @@ -1080,6 +2079,41 @@ static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, // ARGB expects first 3 values to contain RGB and 4th value is ignored #ifdef HAS_ARGBTOYMATRIXROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_argb += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#else void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -1110,6 +2144,7 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_ARGBTOYROW_RVV void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1137,6 +2172,41 @@ void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { // RGBA expects first value to be A and ignored, then 3 values to contain RGB. 
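Illustration (not part of the patch): the *ToYMatrixRow_RVV helpers above and below all compute the same Q8 weighted sum; only which memory channel feeds which weight changes with the pixel layout. A scalar model with a hypothetical helper name, taking the first three colour channels in memory order and the weights from rgbconstants->kRGBToY.

#include <stdint.h>

// One pixel of the Y computation: dot product with the kRGBToY weights plus
// the kAddY bias, keeping the high byte of the Q8 result.
static inline uint8_t WeightedLuma(uint8_t c0, uint8_t c1, uint8_t c2,
                                   const uint8_t w[3], uint16_t add_y) {
  uint32_t acc = (uint32_t)c0 * w[0] + (uint32_t)c1 * w[1] +
                 (uint32_t)c2 * w[2] + add_y;
  return (uint8_t)(acc >> 8);
}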
#ifdef HAS_RGBATOYMATRIXROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgba += 4 * vl; + dst_y += vl; + } while (w > 0); +} +#else void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width, @@ -1167,6 +2237,7 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba, } while (w > 0); } #endif +#endif #ifdef HAS_RGBATOYROW_RVV void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) { @@ -1187,6 +2258,40 @@ void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) { #endif #ifdef HAS_RGBTOYMATRIXROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct RgbConstants* rgbconstants) { + assert(width != 0); + size_t w = (size_t)width; + vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant + vuint16m4_t v_addy; // vector is to store kAddY + size_t vl = __riscv_vsetvl_e8m2(w); + v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl); + v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl); + v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl); + v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl); + do { + vuint8m2_t v_y; + vuint16m4_t v_y_u16; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x3_t v_src_rgb = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 2); + v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl); + v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl); + v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl); + v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl); + __riscv_vse8_v_u8m2(dst_y, v_y, vl); + w -= vl; + src_rgb += 3 * vl; + dst_y += vl; + } while (w > 0); +} +#else void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, uint8_t* dst_y, int width, @@ -1217,6 +2322,7 @@ void RGBToYMatrixRow_RVV(const uint8_t* src_rgb, } while (w > 0); } #endif +#endif #ifdef HAS_RGB24TOYJROW_RVV void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { @@ -1246,6 +2352,54 @@ void RAWToYRow_RVV(const uint8_t* 
src_raw, uint8_t* dst_y, int width) { // dst_argb may be src_argb or src_argb1. // src_argb: RGB values have already been pre-multiplied by the a. #ifdef HAS_ARGBBLENDROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBBlendRow_RVV(const uint8_t* src_argb, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + size_t vl = __riscv_vsetvlmax_e8m2(); + // clamp255((((256 - a) * b) >> 8) + f) + // = b * (256 - a) / 256 + f + // = b - (b * a / 256) + f + vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl); + do { + vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r; + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + vuint8m2x4_t v_dst_argb; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src0_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_src0_b = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 0); + vuint8m2_t v_src0_g = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 1); + vuint8m2_t v_src0_r = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 2); + vuint8m2_t v_src0_a = __riscv_vget_v_u8m2x4_u8m2(v_src0_argb, 3); + vuint8m2x4_t v_src1_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb1, vl); + vuint8m2_t v_src1_b = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 0); + vuint8m2_t v_src1_g = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 1); + vuint8m2_t v_src1_r = __riscv_vget_v_u8m2x4_u8m2(v_src1_argb, 2); + + v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl); + v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl); + v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl); + + v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl); + v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl); + v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl); + + v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl); + v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl); + v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl); + + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_dst_b, v_dst_g, v_dst_r, v_255); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + + w -= vl; + src_argb += 4 * vl; + src_argb1 += 4 * vl; + dst_argb += 4 * vl; + } while (w > 0); +} +#else void ARGBBlendRow_RVV(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, @@ -1287,6 +2441,7 @@ void ARGBBlendRow_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_BLENDPLANEROW_RVV void BlendPlaneRow_RVV(const uint8_t* src0, @@ -1323,6 +2478,41 @@ void BlendPlaneRow_RVV(const uint8_t* src0, // Attenuate: (f * a + 255) >> 8 #ifdef HAS_ARGBATTENUATEROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + do { + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + vuint8m2x4_t v_dst_argb; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0); + vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1); + vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); + // f * a + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + // f * a + 255 + v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl); + v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl); + v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl); + // (f * a + 255) >> 8 + v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl); + v_r = 
__riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl); + + v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a); + __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} +#else void ARGBAttenuateRow_RVV(const uint8_t* src_argb, uint8_t* dst_argb, int width) { @@ -1351,8 +2541,25 @@ void ARGBAttenuateRow_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + size_t w = (size_t)width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl); + vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3); + __riscv_vse8_v_u8m2(dst_a, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_a += vl; + } while (w > 0); +} +#else void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, uint8_t* dst_a, int width) { @@ -1368,6 +2575,7 @@ void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_ARGBCOPYYTOALPHAROW_RVV void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) { diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc index de037e45c..5a6f6e5fc 100644 --- a/source/scale_rvv.cc +++ b/source/scale_rvv.cc @@ -67,6 +67,30 @@ void ScaleARGBRowDown2_RVV(const uint8_t* src_argb, #endif #ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + size_t w = (size_t)dst_width; + const uint32_t* src = (const uint32_t*)(src_argb); + do { + size_t vl = __riscv_vsetvl_e32m4(w); + vuint32m4x2_t v_src = __riscv_vlseg2e32_v_u32m4x2(src, vl); + vuint32m4_t v_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src, 0); + vuint32m4_t v_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src, 1); + vuint8m4_t v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32); + vuint8m4_t v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32); + vuint8m4_t v_dst = + __riscv_vaaddu_vv_u8m4(v_even, v_odd, __RISCV_VXRM_RNU, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src += vl * 2; + dst_argb += vl * 4; + } while (w > 0); +} +#else void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -93,8 +117,45 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEARGBROWDOWN2BOX_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src0 = (const uint32_t*)(src_argb); + const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); + do { + size_t vl = __riscv_vsetvl_e32m4(w); + vuint32m4x2_t v_src0 = __riscv_vlseg2e32_v_u32m4x2(src0, vl); + vuint32m4x2_t v_src1 = __riscv_vlseg2e32_v_u32m4x2(src1, vl); + vuint32m4_t v_row0_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 0); + vuint32m4_t v_row0_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 1); + vuint32m4_t v_row1_even_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 0); + vuint32m4_t v_row1_odd_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 1); + vuint8m4_t v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32); + vuint8m4_t v_row0_odd = 
__riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32); + vuint8m4_t v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32); + vuint8m4_t v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32); + vuint16m8_t v_row0_sum = + __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4); + vuint16m8_t v_row1_sum = + __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4); + vuint16m8_t v_dst_16 = + __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); + vuint8m4_t v_dst = + __riscv_vnclipu_wx_u8m4(v_dst_16, 2, __RISCV_VXRM_RNU, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src0 += vl * 2; + src1 += vl * 2; + dst_argb += vl * 4; + } while (w > 0); +} +#else void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, @@ -129,6 +190,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEARGBROWDOWNEVEN_RVV void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb, @@ -152,6 +214,43 @@ void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb, #endif #ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + size_t w = (size_t)dst_width; + const uint32_t* src0 = (const uint32_t*)(src_argb); + const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); + const int stride_byte = src_stepx * 4; + do { + size_t vl = __riscv_vsetvl_e32m4(w); + vuint32m4x2_t v_src0 = __riscv_vlsseg2e32_v_u32m4x2(src0, stride_byte, vl); + vuint32m4x2_t v_src1 = __riscv_vlsseg2e32_v_u32m4x2(src1, stride_byte, vl); + vuint32m4_t v_row0_low_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 0); + vuint32m4_t v_row0_high_32 = __riscv_vget_v_u32m4x2_u32m4(v_src0, 1); + vuint32m4_t v_row1_low_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 0); + vuint32m4_t v_row1_high_32 = __riscv_vget_v_u32m4x2_u32m4(v_src1, 1); + vuint8m4_t v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32); + vuint8m4_t v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32); + vuint8m4_t v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32); + vuint8m4_t v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32); + vuint16m8_t v_row0_sum = + __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4); + vuint16m8_t v_row1_sum = + __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4); + vuint16m8_t v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4); + vuint8m4_t v_dst = + __riscv_vnclipu_wx_u8m4(v_sum, 2, __RISCV_VXRM_RNU, vl * 4); + __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4); + w -= vl; + src0 += vl * src_stepx; + src1 += vl * src_stepx; + dst_argb += vl * 4; + } while (w > 0); +} +#else void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -190,6 +289,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN2_RVV void ScaleRowDown2_RVV(const uint8_t* src_ptr, @@ -212,6 +312,26 @@ void ScaleRowDown2_RVV(const uint8_t* src_ptr, #endif #ifdef HAS_SCALEROWDOWN2LINEAR_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + size_t w = (size_t)dst_width; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4x2_t v_src = __riscv_vlseg2e8_v_u8m4x2(src_ptr, 
vl); + vuint8m4_t v_s0 = __riscv_vget_v_u8m4x2_u8m4(v_src, 0); + vuint8m4_t v_s1 = __riscv_vget_v_u8m4x2_u8m4(v_src, 1); + vuint8m4_t v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, __RISCV_VXRM_RNU, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + src_ptr += 2 * vl; + dst += vl; + } while (w > 0); +} +#else void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -234,8 +354,38 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN2BOX_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + size_t w = (size_t)dst_width; + do { + size_t vl = __riscv_vsetvl_e8m4(w); + vuint8m4x2_t v_s = __riscv_vlseg2e8_v_u8m4x2(s, vl); + vuint8m4x2_t v_t = __riscv_vlseg2e8_v_u8m4x2(t, vl); + vuint8m4_t v_s0 = __riscv_vget_v_u8m4x2_u8m4(v_s, 0); + vuint8m4_t v_s1 = __riscv_vget_v_u8m4x2_u8m4(v_s, 1); + vuint8m4_t v_t0 = __riscv_vget_v_u8m4x2_u8m4(v_t, 0); + vuint8m4_t v_t1 = __riscv_vget_v_u8m4x2_u8m4(v_t, 1); + vuint16m8_t v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl); + vuint16m8_t v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl); + vuint16m8_t v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl); + // Use round-to-nearest-up mode for vnclip + vuint8m4_t v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, __RISCV_VXRM_RNU, vl); + __riscv_vse8_v_u8m4(dst, v_dst, vl); + w -= vl; + s += 2 * vl; + t += 2 * vl; + dst += vl; + } while (w > 0); +} +#else void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, @@ -266,8 +416,27 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN4_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleRowDown4_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl); + vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); + __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl); + w -= vl; + src_ptr += (4 * vl); + dst_ptr += vl; + } while (w > 0); +} +#else void ScaleRowDown4_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -285,8 +454,70 @@ void ScaleRowDown4_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN4BOX_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + size_t w = (size_t)dst_width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl); + vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0); + vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); + vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); + vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); + vuint16m4_t v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl); + vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(src_ptr1, vl); + vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); + vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); + vuint8m2_t v_t2 = 
__riscv_vget_v_u8m2x4_u8m2(v_t, 2); + vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); + vuint16m4_t v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl); + vuint8m2x4_t v_u = __riscv_vlseg4e8_v_u8m2x4(src_ptr2, vl); + vuint8m2_t v_u0 = __riscv_vget_v_u8m2x4_u8m2(v_u, 0); + vuint8m2_t v_u1 = __riscv_vget_v_u8m2x4_u8m2(v_u, 1); + vuint8m2_t v_u2 = __riscv_vget_v_u8m2x4_u8m2(v_u, 2); + vuint8m2_t v_u3 = __riscv_vget_v_u8m2x4_u8m2(v_u, 3); + vuint16m4_t v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl); + vuint16m4_t v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl); + vuint16m4_t v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl); + vuint16m4_t v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl); + vuint16m4_t v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl); + vuint16m4_t v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl); + vuint8m2x4_t v_v = __riscv_vlseg4e8_v_u8m2x4(src_ptr3, vl); + vuint8m2_t v_v0 = __riscv_vget_v_u8m2x4_u8m2(v_v, 0); + vuint8m2_t v_v1 = __riscv_vget_v_u8m2x4_u8m2(v_v, 1); + vuint8m2_t v_v2 = __riscv_vget_v_u8m2x4_u8m2(v_v, 2); + vuint8m2_t v_v3 = __riscv_vget_v_u8m2x4_u8m2(v_v, 3); + + vuint16m4_t v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl); + vuint16m4_t v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl); + + vuint16m4_t v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl); + vuint16m4_t v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl); + + vuint16m4_t v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl); + vuint16m4_t v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl); + vuint16m4_t v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl); + vuint8m2_t v_dst = + __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, __RISCV_VXRM_RNU, vl); + __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += 4 * vl; + src_ptr1 += 4 * vl; + src_ptr2 += 4 * vl; + src_ptr3 += 4 * vl; + dst_ptr += vl; + } while (w > 0); +} +#else void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -348,8 +579,29 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN34_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleRowDown34_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_src = __riscv_vlseg4e8_v_u8m2x4(src_ptr, vl); + vuint8m2_t v_0 = __riscv_vget_v_u8m2x4_u8m2(v_src, 0); + vuint8m2_t v_1 = __riscv_vget_v_u8m2x4_u8m2(v_src, 1); + vuint8m2_t v_3 = __riscv_vget_v_u8m2x4_u8m2(v_src, 3); + vuint8m2x3_t v_dst = __riscv_vcreate_v_u8m2x3(v_0, v_1, v_3); + __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#else void ScaleRowDown34_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -366,8 +618,77 @@ void ScaleRowDown34_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN34_0_BOX_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + do { + vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; + vuint8m2_t v_u0, v_u1, v_u2, v_u3; + vuint16m4_t v_u1_u16; + vuint8m2_t v_a0, v_a1, v_a2; + vuint8m2x3_t v_dst; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(s, vl); + vuint8m2_t v_s0 = 
__riscv_vget_v_u8m2x4_u8m2(v_s, 0); + vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); + vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); + vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); + + if (src_stride == 0) { + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl); + v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl); + } else { + vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(t, vl); + vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); + vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); + vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2); + vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl); + v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl); + v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl); + t += 4 * vl; + } + + v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl); + v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl); + v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl); + v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl); + + v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, __RISCV_VXRM_RNU, vl); + v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, __RISCV_VXRM_RNU, vl); + v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, __RISCV_VXRM_RNU, vl); + v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, __RISCV_VXRM_RNU, vl); + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl); + v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, __RISCV_VXRM_RNU, vl); + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl); + v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); + + v_dst = __riscv_vcreate_v_u8m2x3(v_a0, v_a1, v_a2); + __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl); + + w -= vl; + s += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#else void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -434,8 +755,69 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN34_1_BOX_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + do { + vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; + vuint16m4_t v_u1_u16; + vuint8m2_t v_a0, v_a1, v_a2; + vuint8m2x3_t v_dst; + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(s, vl); + vuint8m2_t v_s0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0); + vuint8m2_t v_s1 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); + vuint8m2_t v_s2 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); + vuint8m2_t v_s3 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); + + // Use round-to-nearest-up mode for vnclip & averaging add + if (src_stride == 0) { + v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, __RISCV_VXRM_RNU, vl); + v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, __RISCV_VXRM_RNU, vl); + v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, __RISCV_VXRM_RNU, vl); + v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, 
__RISCV_VXRM_RNU, vl); + } else { + vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(t, vl); + vuint8m2_t v_t0 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); + vuint8m2_t v_t1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); + vuint8m2_t v_t2 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2); + vuint8m2_t v_t3 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); + v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, __RISCV_VXRM_RNU, vl); + v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, __RISCV_VXRM_RNU, vl); + v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, __RISCV_VXRM_RNU, vl); + v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, __RISCV_VXRM_RNU, vl); + t += 4 * vl; + } + // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl); + v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); + + // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 + v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, __RISCV_VXRM_RNU, vl); + + // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 + v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl); + v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl); + v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, __RISCV_VXRM_RNU, vl); + + v_dst = __riscv_vcreate_v_u8m2x3(v_a0, v_a1, v_a2); + __riscv_vsseg3e8_v_u8m2x3(dst_ptr, v_dst, vl); + + w -= vl; + s += 4 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#else void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -490,8 +872,31 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN38_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleRowDown38_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + (void)src_stride; + assert(dst_width % 3 == 0); + do { + size_t vl = __riscv_vsetvl_e8m1(w); + vuint8m1x8_t v_src = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl); + vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_src, 0); + vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_src, 3); + vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_src, 6); + vuint8m1x3_t v_dst = __riscv_vcreate_v_u8m1x3(v_s0, v_s3, v_s6); + __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#else void ScaleRowDown38_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -511,8 +916,77 @@ void ScaleRowDown38_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN38_2_BOX_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint16_t coeff_a = (65536u / 6u); + const uint16_t coeff_b = (65536u / 4u); + assert((dst_width % 3 == 0) && (dst_width > 0)); + do { + vuint16m2_t v_e, v_f, v_g; + vuint8m1_t v_dst_e, v_dst_f, v_dst_g; + vuint8m1x3_t v_dst; + size_t vl = __riscv_vsetvl_e8m1(w); + // s: e00, e10, e20, f00, f10, f20, g00, g10 + vuint8m1x8_t v_s = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl); + vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_s, 0); + vuint8m1_t v_s1 = __riscv_vget_v_u8m1x8_u8m1(v_s, 1); + vuint8m1_t v_s2 = __riscv_vget_v_u8m1x8_u8m1(v_s, 2); + vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_s, 3); + vuint8m1_t v_s4 = __riscv_vget_v_u8m1x8_u8m1(v_s, 4); + vuint8m1_t v_s5 = __riscv_vget_v_u8m1x8_u8m1(v_s, 5); + vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_s, 6); + vuint8m1_t v_s7 = __riscv_vget_v_u8m1x8_u8m1(v_s, 
7); + // t: e01, e11, e21, f01, f11, f21, g01, g11 + vuint8m1x8_t v_t = __riscv_vlseg8e8_v_u8m1x8(src_ptr + src_stride, vl); + vuint8m1_t v_t0 = __riscv_vget_v_u8m1x8_u8m1(v_t, 0); + vuint8m1_t v_t1 = __riscv_vget_v_u8m1x8_u8m1(v_t, 1); + vuint8m1_t v_t2 = __riscv_vget_v_u8m1x8_u8m1(v_t, 2); + vuint8m1_t v_t3 = __riscv_vget_v_u8m1x8_u8m1(v_t, 3); + vuint8m1_t v_t4 = __riscv_vget_v_u8m1x8_u8m1(v_t, 4); + vuint8m1_t v_t5 = __riscv_vget_v_u8m1x8_u8m1(v_t, 5); + vuint8m1_t v_t6 = __riscv_vget_v_u8m1x8_u8m1(v_t, 6); + vuint8m1_t v_t7 = __riscv_vget_v_u8m1x8_u8m1(v_t, 7); + // Calculate sum of [e00, e21] to v_e + // Calculate sum of [f00, f21] to v_f + // Calculate sum of [g00, g11] to v_g + vuint16m2_t v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); + vuint16m2_t v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); + vuint16m2_t v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); + vuint16m2_t v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); + vuint16m2_t v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); + vuint16m2_t v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); + vuint16m2_t v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); + vuint16m2_t v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); + + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); + v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); + v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); + v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); + + // Average in 16-bit fixed-point + v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); + v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); + v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); + v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); + v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); + v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); + + v_dst = __riscv_vcreate_v_u8m1x3(v_dst_e, v_dst_f, v_dst_g); + __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#else void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -569,8 +1043,101 @@ void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEROWDOWN38_3_BOX_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + size_t w = (size_t)dst_width / 3u; + const uint16_t coeff_a = (65536u / 9u); + const uint16_t coeff_b = (65536u / 6u); + assert((dst_width % 3 == 0) && (dst_width > 0)); + do { + vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e; + vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f; + vuint16m2_t v_g0, v_g1, v_g2, v_g; + vuint8m1_t v_dst_e, v_dst_f, v_dst_g; + vuint8m1x3_t v_dst; + size_t vl = __riscv_vsetvl_e8m1(w); + // s: e00, e10, e20, f00, f10, f20, g00, g10 + vuint8m1x8_t v_s = __riscv_vlseg8e8_v_u8m1x8(src_ptr, vl); + vuint8m1_t v_s0 = __riscv_vget_v_u8m1x8_u8m1(v_s, 0); + vuint8m1_t v_s1 = __riscv_vget_v_u8m1x8_u8m1(v_s, 1); + vuint8m1_t v_s2 = __riscv_vget_v_u8m1x8_u8m1(v_s, 2); + vuint8m1_t v_s3 = __riscv_vget_v_u8m1x8_u8m1(v_s, 3); + vuint8m1_t v_s4 = __riscv_vget_v_u8m1x8_u8m1(v_s, 4); + vuint8m1_t v_s5 = __riscv_vget_v_u8m1x8_u8m1(v_s, 5); + vuint8m1_t v_s6 = __riscv_vget_v_u8m1x8_u8m1(v_s, 6); + vuint8m1_t v_s7 = __riscv_vget_v_u8m1x8_u8m1(v_s, 7); + // t: e01, e11, e21, f01, f11, f21, g01, g11 + vuint8m1x8_t v_t = __riscv_vlseg8e8_v_u8m1x8(src_ptr + src_stride, vl); + vuint8m1_t v_t0 = __riscv_vget_v_u8m1x8_u8m1(v_t, 0); + vuint8m1_t v_t1 = __riscv_vget_v_u8m1x8_u8m1(v_t, 1); + 
vuint8m1_t v_t2 = __riscv_vget_v_u8m1x8_u8m1(v_t, 2); + vuint8m1_t v_t3 = __riscv_vget_v_u8m1x8_u8m1(v_t, 3); + vuint8m1_t v_t4 = __riscv_vget_v_u8m1x8_u8m1(v_t, 4); + vuint8m1_t v_t5 = __riscv_vget_v_u8m1x8_u8m1(v_t, 5); + vuint8m1_t v_t6 = __riscv_vget_v_u8m1x8_u8m1(v_t, 6); + vuint8m1_t v_t7 = __riscv_vget_v_u8m1x8_u8m1(v_t, 7); + // u: e02, e12, e22, f02, f12, f22, g02, g12 + vuint8m1x8_t v_u = __riscv_vlseg8e8_v_u8m1x8(src_ptr + 2 * src_stride, vl); + vuint8m1_t v_u0 = __riscv_vget_v_u8m1x8_u8m1(v_u, 0); + vuint8m1_t v_u1 = __riscv_vget_v_u8m1x8_u8m1(v_u, 1); + vuint8m1_t v_u2 = __riscv_vget_v_u8m1x8_u8m1(v_u, 2); + vuint8m1_t v_u3 = __riscv_vget_v_u8m1x8_u8m1(v_u, 3); + vuint8m1_t v_u4 = __riscv_vget_v_u8m1x8_u8m1(v_u, 4); + vuint8m1_t v_u5 = __riscv_vget_v_u8m1x8_u8m1(v_u, 5); + vuint8m1_t v_u6 = __riscv_vget_v_u8m1x8_u8m1(v_u, 6); + vuint8m1_t v_u7 = __riscv_vget_v_u8m1x8_u8m1(v_u, 7); + // Calculate sum of [e00, e22] + v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl); + v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl); + v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl); + v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl); + v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl); + + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl); + v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl); + v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl); + v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl); + // Calculate sum of [f00, f22] + v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl); + v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl); + v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl); + v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl); + v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl); + + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl); + v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl); + v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl); + v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl); + // Calculate sum of [g00, g12] + v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl); + v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl); + v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl); + + v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl); + v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl); + + // Average in 16-bit fixed-point + v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl); + v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl); + v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl); + v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl); + v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl); + v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl); + + v_dst = __riscv_vcreate_v_u8m1x3(v_dst_e, v_dst_f, v_dst_g); + __riscv_vsseg3e8_v_u8m1x3(dst_ptr, v_dst, vl); + w -= vl; + src_ptr += 8 * vl; + dst_ptr += 3 * vl; + } while (w > 0); +} +#else void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -642,12 +1209,50 @@ void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr, } while (w > 0); } #endif +#endif // ScaleUVRowUp2_(Bi)linear_RVV function is equal to other platforms' // ScaleRowUp2_(Bi)linear_Any_XXX. We process entire row in this function. Other // platforms only implement non-edge part of image and process edge with scalar. 
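// Editorial note (illustrative, not from the patch): the upsamplers below use
// the standard 3:1 linear kernel, applied once in the linear case and in each
// dimension in the bilinear case. A scalar reference for one interior output
// pair of the 1-D case, with adjacent source samples s[i] and s[i + 1]:
//   dst[2 * i + 1] = (3 * s[i] + s[i + 1] + 2) >> 2;
//   dst[2 * i + 2] = (s[i] + 3 * s[i + 1] + 2) >> 2;
// The edge columns are written with scalar code before and after the vector
// loop, which is how a whole row (edges included) is produced in one call.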
#ifdef HAS_SCALEROWUP2_LINEAR_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + size_t work_width = (size_t)dst_width - 1u; + size_t src_width = work_width >> 1u; + const uint8_t* work_src_ptr = src_ptr; + uint8_t* work_dst_ptr = dst_ptr + 1; + size_t vl = __riscv_vsetvlmax_e8m4(); + vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl); + dst_ptr[0] = src_ptr[0]; + while (src_width > 0) { + vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even; + vuint16m8_t v_src0_u16, v_src1_u16; + vuint8m4x2_t v_dst; + size_t vl = __riscv_vsetvl_e8m4(src_width); + v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); + v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl); + + v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl); + v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl); + v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl); + v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl); + + v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl); + v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl); + + v_dst = __riscv_vcreate_v_u8m4x2(v_dst_even, v_dst_odd); + __riscv_vsseg2e8_v_u8m4x2(work_dst_ptr, v_dst, vl); + + src_width -= vl; + work_src_ptr += vl; + work_dst_ptr += 2 * vl; + } + dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; +} +#else void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -682,8 +1287,82 @@ void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr, dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2]; } #endif +#endif #ifdef HAS_SCALEROWUP2_BILINEAR_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + size_t src_width = work_width >> 1u; + const uint8_t* work_s = src_ptr; + const uint8_t* work_t = src_ptr + src_stride; + const uint8_t* s = work_s; + const uint8_t* t = work_t; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + uint8_t* work_d = d + 1; + uint8_t* work_e = e + 1; + size_t vl = __riscv_vsetvlmax_e16m4(); + vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); + vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); + d[0] = (3 * s[0] + t[0] + 2) >> 2; + e[0] = (s[0] + 3 * t[0] + 2) >> 2; + while (src_width > 0) { + vuint8m2_t v_s0, v_s1, v_t0, v_t1; + vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; + vuint16m4_t v_t0_u16_, v_t1_u16_; + vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; + vuint8m2x2_t v_dst0, v_dst1; + size_t vl = __riscv_vsetvl_e8m2(src_width); + v_s0 = __riscv_vle8_v_u8m2(work_s, vl); + v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl); + + v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); + v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); + + v_t0 = __riscv_vle8_v_u8m2(work_t, vl); + v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl); + + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); + v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); + v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); + + v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); + v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); + + v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); + v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, 
vl); + v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); + v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); + + v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); + v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); + v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); + v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); + + v_dst0 = __riscv_vcreate_v_u8m2x2(v_dst0_even, v_dst0_odd); + __riscv_vsseg2e8_v_u8m2x2(work_d, v_dst0, vl); + v_dst1 = __riscv_vcreate_v_u8m2x2(v_dst1_even, v_dst1_odd); + __riscv_vsseg2e8_v_u8m2x2(work_e, v_dst1, vl); + src_width -= vl; + work_s += vl; + work_t += vl; + work_d += 2 * vl; + work_e += 2 * vl; + } + d[dst_width - 1] = + (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2; + e[dst_width - 1] = + (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2; +} +#else void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -754,6 +1433,7 @@ void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr, (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2; } #endif +#endif #ifdef HAS_SCALEUVROWDOWN2_RVV void ScaleUVRowDown2_RVV(const uint8_t* src_uv, @@ -777,6 +1457,30 @@ void ScaleUVRowDown2_RVV(const uint8_t* src_uv, #endif #ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + size_t w = (size_t)dst_width; + const uint16_t* src = (const uint16_t*)src_uv; + (void)src_stride; + do { + size_t vl = __riscv_vsetvl_e16m4(w); + vuint16m4x2_t v_src = __riscv_vlseg2e16_v_u16m4x2(src, vl); + vuint16m4_t v_u0v0_16 = __riscv_vget_v_u16m4x2_u16m4(v_src, 0); + vuint16m4_t v_u1v1_16 = __riscv_vget_v_u16m4x2_u16m4(v_src, 1); + vuint8m4_t v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16); + vuint8m4_t v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16); + vuint8m4_t v_avg = + __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, __RISCV_VXRM_RNU, vl * 2); + __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2); + w -= vl; + src += vl * 2; + dst_uv += vl * 2; + } while (w > 0); +} +#else void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, @@ -803,8 +1507,50 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEUVROWDOWN2BOX_RVV +#if defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VXRM_ARG) +void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, + ptrdiff_t src_stride, + uint8_t* dst_uv, + int dst_width) { + const uint8_t* src_uv_row1 = src_uv + src_stride; + size_t w = (size_t)dst_width; + do { + size_t vl = __riscv_vsetvl_e8m2(w); + vuint8m2x4_t v_s = __riscv_vlseg4e8_v_u8m2x4(src_uv, vl); + vuint8m2_t v_u0_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 0); + vuint8m2_t v_v0_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 1); + vuint8m2_t v_u1_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 2); + vuint8m2_t v_v1_row0 = __riscv_vget_v_u8m2x4_u8m2(v_s, 3); + vuint8m2x4_t v_t = __riscv_vlseg4e8_v_u8m2x4(src_uv_row1, vl); + vuint8m2_t v_u0_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 0); + vuint8m2_t v_v0_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 1); + vuint8m2_t v_u1_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 2); + vuint8m2_t v_v1_row1 = __riscv_vget_v_u8m2x4_u8m2(v_t, 3); + + vuint16m4_t v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl); + vuint16m4_t v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl); + vuint16m4_t v_v0v1_row0 
= __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl); + vuint16m4_t v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl); + vuint16m4_t v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl); + vuint16m4_t v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl); + vuint8m2_t v_dst_u = + __riscv_vnclipu_wx_u8m2(v_sum0, 2, __RISCV_VXRM_RNU, vl); + vuint8m2_t v_dst_v = + __riscv_vnclipu_wx_u8m2(v_sum1, 2, __RISCV_VXRM_RNU, vl); + + vuint8m2x2_t v_dst_uv = __riscv_vcreate_v_u8m2x2(v_dst_u, v_dst_v); + __riscv_vsseg2e8_v_u8m2x2(dst_uv, v_dst_uv, vl); + + dst_uv += 2 * vl; + src_uv += 4 * vl; + w -= vl; + src_uv_row1 += 4 * vl; + } while (w > 0); +} +#else void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, @@ -847,6 +1593,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, } while (w > 0); } #endif +#endif #ifdef HAS_SCALEUVROWDOWN4_RVV void ScaleUVRowDown4_RVV(const uint8_t* src_uv, @@ -903,6 +1650,49 @@ void ScaleUVRowDownEven_RVV(const uint8_t* src_uv, // scalar. #ifdef HAS_SCALEUVROWUP2_LINEAR_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1; + const uint8_t* work_src_ptr = src_ptr; + size_t vl = __riscv_vsetvlmax_e8m4(); + vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl); + dst_ptr[0] = src_ptr[0]; + dst_ptr[1] = src_ptr[1]; + while (work_width > 0) { + vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8; + vuint16m4_t v_dst_odd, v_dst_even; + vuint16m8_t v_uv0_u16, v_uv1_u16; + vuint16m4x2_t v_dst; + size_t vl = __riscv_vsetvl_e8m4(work_width); + v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl); + v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl); + + v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl); + v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl); + + v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl); + v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl); + + v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl); + v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl); + + v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8); + v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8); + + v_dst = __riscv_vcreate_v_u16m4x2(v_dst_even, v_dst_odd); + __riscv_vsseg2e16_v_u16m4x2(work_dst_ptr, v_dst, vl / 2); + + work_width -= vl; + work_src_ptr += vl; + work_dst_ptr += vl; + } + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; +} +#else void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -943,8 +1733,98 @@ void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr, dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; } #endif +#endif #ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV +#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE +void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + size_t work_width = ((size_t)dst_width - 1u) & ~1u; + const uint8_t* work_s = src_ptr; + const uint8_t* work_t = src_ptr + src_stride; + const uint8_t* s = work_s; + const uint8_t* t = work_t; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + uint16_t* work_d = (uint16_t*)d + 1; + uint16_t* work_e = (uint16_t*)e + 1; + size_t vl = __riscv_vsetvlmax_e16m4(); + vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl); + 
vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl); + d[0] = (3 * s[0] + t[0] + 2) >> 2; + e[0] = (s[0] + 3 * t[0] + 2) >> 2; + d[1] = (3 * s[1] + t[1] + 2) >> 2; + e[1] = (s[1] + 3 * t[1] + 2) >> 2; + while (work_width > 0) { + vuint8m2_t v_s0, v_s1, v_t0, v_t1; + vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16; + vuint16m4_t v_t0_u16_, v_t1_u16_; + vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8; + vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd; + vuint16m2x2_t v_dst0, v_dst1; + size_t vl = __riscv_vsetvl_e8m2(work_width); + v_s0 = __riscv_vle8_v_u8m2(work_s, vl); + v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl); + + v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl); + v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl); + v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl); + v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl); + + v_t0 = __riscv_vle8_v_u8m2(work_t, vl); + v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl); + + v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl); + v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl); + v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl); + v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl); + + v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl); + v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl); + + v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl); + v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl); + v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl); + v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl); + + v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl); + v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl); + v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl); + v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl); + + v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8); + v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8); + v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8); + v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8); + + v_dst0 = __riscv_vcreate_v_u16m2x2(v_dst0_even, v_dst0_odd); + __riscv_vsseg2e16_v_u16m2x2(work_d, v_dst0, vl / 2); + v_dst1 = __riscv_vcreate_v_u16m2x2(v_dst1_even, v_dst1_odd); + __riscv_vsseg2e16_v_u16m2x2(work_e, v_dst1, vl / 2); + + work_width -= vl; + work_s += vl; + work_t += vl; + work_d += vl; + work_e += vl; + } + d[2 * dst_width - 2] = + (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >> + 2; + e[2 * dst_width - 2] = + (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >> + 2; + d[2 * dst_width - 1] = + (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >> + 2; + e[2 * dst_width - 1] = + (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >> + 2; +} +#else void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1030,6 +1910,7 @@ void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr, 2; } #endif +#endif #ifdef __cplusplus } // extern "C"
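// Editorial sketch (not part of the patch): a minimal routine showing what the
// LIBYUV_RVV_HAS_TUPLE_TYPE / LIBYUV_RVV_HAS_VXRM_ARG guards above select
// between. The function name AveragePairs is illustrative only; the intrinsic
// spellings follow the v0.12 (tuple type, explicit VXRM argument) forms used
// in this patch and, to the best of my reading, the v0.11 forms they replace.

#include <riscv_vector.h>
#include <stdint.h>

// dst[i] = (src[2*i] + src[2*i+1] + 1) >> 1, a rounded pairwise average,
// for `width` output bytes.
void AveragePairs(const uint8_t* src, uint8_t* dst, int width) {
  size_t w = (size_t)width;
#ifndef LIBYUV_RVV_HAS_VXRM_ARG
  // v0.11 fixed-point intrinsics read the rounding mode from the vxrm CSR,
  // so set round-to-nearest-up (0) up front.
  asm volatile("csrwi vxrm, 0");
#endif
  do {
    size_t vl = __riscv_vsetvl_e8m4(w);
#ifdef LIBYUV_RVV_HAS_TUPLE_TYPE
    // v0.12: segment loads return a tuple type.
    vuint8m4x2_t v = __riscv_vlseg2e8_v_u8m4x2(src, vl);
    vuint8m4_t v0 = __riscv_vget_v_u8m4x2_u8m4(v, 0);
    vuint8m4_t v1 = __riscv_vget_v_u8m4x2_u8m4(v, 1);
#else
    // v0.11: segment loads write through output operands.
    vuint8m4_t v0, v1;
    __riscv_vlseg2e8_v_u8m4(&v0, &v1, src, vl);
#endif
#ifdef LIBYUV_RVV_HAS_VXRM_ARG
    // v0.12: the rounding mode is an explicit argument.
    vuint8m4_t v_avg = __riscv_vaaddu_vv_u8m4(v0, v1, __RISCV_VXRM_RNU, vl);
#else
    vuint8m4_t v_avg = __riscv_vaaddu_vv_u8m4(v0, v1, vl);
#endif
    __riscv_vse8_v_u8m4(dst, v_avg, vl);
    w -= vl;
    src += 2 * vl;
    dst += vl;
  } while (w > 0);
}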