diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 3488d2568..3590f125a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -807,6 +807,7 @@ extern "C" {
 #define HAS_ABGRTOYROW_RVV
 #define HAS_ABGRTOYJROW_RVV
 #define HAS_BGRATOYROW_RVV
+#define HAS_COPYROW_RVV
 #define HAS_I400TOARGBROW_RVV
 #define HAS_I422ALPHATOARGBROW_RVV
 #define HAS_I422TOARGBROW_RVV
@@ -815,12 +816,15 @@ extern "C" {
 #define HAS_I444ALPHATOARGBROW_RVV
 #define HAS_I444TOARGBROW_RVV
 #define HAS_I444TORGB24ROW_RVV
+#define HAS_INTERPOLATEROW_RVV
 #define HAS_J400TOARGBROW_RVV
 #define HAS_MERGEARGBROW_RVV
 #define HAS_MERGERGBROW_RVV
+#define HAS_MERGEUVROW_RVV
 #define HAS_MERGEXRGBROW_RVV
 #define HAS_SPLITARGBROW_RVV
 #define HAS_SPLITRGBROW_RVV
+#define HAS_SPLITUVROW_RVV
 #define HAS_SPLITXRGBROW_RVV
 #define HAS_RAWTOARGBROW_RVV
 #define HAS_RAWTORGB24ROW_RVV
@@ -832,9 +836,6 @@ extern "C" {
 #define HAS_RGB24TOYROW_RVV
 #define HAS_RGBATOYROW_RVV
 #define HAS_RGBATOYJROW_RVV
-#define HAS_SPLITARGBROW_RVV
-#define HAS_SPLITRGBROW_RVV
-#define HAS_SPLITXRGBROW_RVV
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -2242,6 +2243,10 @@ void SplitUVRow_LSX(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
+void SplitUVRow_RVV(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
 void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
@@ -2403,6 +2408,10 @@ void MergeUVRow_LSX(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width);
+void MergeUVRow_RVV(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width);
 void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
                          const uint8_t* uv_buf,
                          uint8_t* dst_ptr,
@@ -3038,6 +3047,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
 void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@@ -5858,6 +5868,11 @@ void InterpolateRow_LSX(uint8_t* dst_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction);
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int source_y_fraction);
 void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
                              const uint8_t* src_ptr,
                              ptrdiff_t src_stride_ptr,
diff --git a/source/convert.cc b/source/convert.cc
index 075428d09..9763d2fea 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -955,6 +955,11 @@ int I422ToNV21(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -995,6 +1000,11 @@ int I422ToNV21(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height);
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 6e05876a0..b192a3b77 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -453,6 +453,11 @@ int ARGBToNV12(const uint8_t* src_argb,
       MergeUVRow_ = MergeUVRow_LSX;
     }
   }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
 #endif
   {
     // Allocate a rows of uv.
@@ -646,6 +651,11 @@ int ARGBToNV21(const uint8_t* src_argb,
       MergeUVRow_ = MergeUVRow_LSX;
     }
   }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
 #endif
   {
     // Allocate a rows of uv.
@@ -826,6 +836,11 @@ int ABGRToNV12(const uint8_t* src_abgr,
      MergeUVRow_ = MergeUVRow_LSX;
     }
   }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
 #endif
   {
     // Allocate a rows of uv.
@@ -1007,6 +1022,11 @@ int ABGRToNV21(const uint8_t* src_abgr,
      MergeUVRow_ = MergeUVRow_LSX;
     }
   }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
 #endif
   {
     // Allocate a rows of uv.
@@ -3203,6 +3223,11 @@ int RAWToJNV21(const uint8_t* src_raw,
      MergeUVRow_ = MergeUVRow_LSX;
     }
   }
+#endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow_ = MergeUVRow_RVV;
+  }
 #endif
   {
     // Allocate a row of uv.
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e741dc509..d115a2a10 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -75,6 +75,11 @@ void CopyPlane(const uint8_t* src_y,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -545,6 +550,11 @@ void SplitUVPlane(const uint8_t* src_uv,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitUVRow = SplitUVRow_RVV;
+  }
+#endif
   for (y = 0; y < height; ++y) {
     // Copy a row of UV.
     SplitUVRow(src_uv, dst_u, dst_v, width);
@@ -631,6 +641,11 @@ void MergeUVPlane(const uint8_t* src_u,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    MergeUVRow = MergeUVRow_RVV;
+  }
+#endif
   for (y = 0; y < height; ++y) {
     // Merge a row of U and V into a row of UV.
     MergeUVRow(src_u, src_v, dst_uv, width);
@@ -4348,6 +4363,11 @@ int InterpolatePlane(const uint8_t* src0,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     InterpolateRow(dst, src0, src1 - src0, width, interpolation);
@@ -5560,6 +5580,12 @@ int UYVYToNV12(const uint8_t* src_uyvy,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    SplitUVRow = SplitUVRow_RVV;
+  }
+#endif
+
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -5600,6 +5626,11 @@ int UYVYToNV12(const uint8_t* src_uyvy,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
@@ -5665,6 +5696,7 @@ void HalfMergeUVPlane(const uint8_t* src_u,
     HalfMergeUVRow = HalfMergeUVRow_AVX2;
   }
 #endif
+
   for (y = 0; y < height - 1; y += 2) {
     // Merge a row of U and V into a row of UV.
     HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
diff --git a/source/rotate.cc b/source/rotate.cc
index 6797ff02b..8d3978c71 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -214,6 +214,11 @@ void RotatePlane180(const uint8_t* src,
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 9667f34c2..c72390108 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -192,6 +192,11 @@ static int ARGBRotate180(const uint8_t* src_argb,
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
+#if defined(HAS_COPYROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    CopyRow = CopyRow_RVV;
+  }
+#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
diff --git a/source/row_rvv.cc b/source/row_rvv.cc
index be4c4a309..7297a401d 100644
--- a/source/row_rvv.cc
+++ b/source/row_rvv.cc
@@ -528,6 +528,75 @@ void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   } while (w > 0);
 }
 
+void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m8(w);
+    vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
+    __riscv_vse8_v_u8m8(dst, v_data, vl);
+    w -= vl;
+    src += vl;
+    dst += vl;
+  } while (w > 0);
+}
+
+// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
+void InterpolateRow_RVV(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int dst_width,
+                        int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  size_t dst_w = (size_t)dst_width;
+  assert(source_y_fraction >= 0);
+  assert(source_y_fraction < 256);
+  // Blend 100 / 0 - Copy row unchanged.
+  if (y1_fraction == 0) {
+    do {
+      size_t vl = __riscv_vsetvl_e8m8(dst_w);
+      __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
+      dst_w -= vl;
+      src_ptr += vl;
+      dst_ptr += vl;
+    } while (dst_w > 0);
+    return;
+  }
+  // Blend 50 / 50.
+  if (y1_fraction == 128) {
+    do {
+      size_t vl = __riscv_vsetvl_e8m8(dst_w);
+      vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
+      vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
+      // Averaging add
+      vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
+      __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
+      dst_w -= vl;
+      src_ptr += vl;
+      src_ptr1 += vl;
+      dst_ptr += vl;
+    } while (dst_w > 0);
+    return;
+  }
+  // General purpose row blend.
+  // To match behavior on other platforms, vxrm (fixed-point rounding mode
+  // register) is set to round-to-nearest-up(0).
+  asm volatile("csrwi vxrm, 0");
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(dst_w);
+    vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
+    vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
+    vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
+    acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
+    __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
+    dst_w -= vl;
+    src_ptr += vl;
+    src_ptr1 += vl;
+    dst_ptr += vl;
+  } while (dst_w > 0);
+}
+
 void SplitRGBRow_RVV(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
@@ -660,6 +729,42 @@ void MergeXRGBRow_RVV(const uint8_t* src_r,
   } while (w > 0);
 }
 
+void SplitUVRow_RVV(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width) {
+  size_t w = (size_t)width;
+  do {
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    vuint8m4_t v_u, v_v;
+    __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
+    __riscv_vse8_v_u8m4(dst_u, v_u, vl);
+    __riscv_vse8_v_u8m4(dst_v, v_v, vl);
+    w -= vl;
+    dst_u += vl;
+    dst_v += vl;
+    src_uv += 2 * vl;
+  } while (w > 0);
+}
+
+void MergeUVRow_RVV(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width) {
+  size_t w = (size_t)width;
+  do {
+    vuint8m4_t v_u, v_v;
+    size_t vl = __riscv_vsetvl_e8m4(w);
+    v_u = __riscv_vle8_v_u8m4(src_u, vl);
+    v_v = __riscv_vle8_v_u8m4(src_v, vl);
+    __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
+    w -= vl;
+    src_u += vl;
+    src_v += vl;
+    dst_uv += 2 * vl;
+  } while (w > 0);
+}
+
 struct RgbConstants {
   uint8_t kRGBToY[4];
   uint16_t kAddY;
diff --git a/source/scale.cc b/source/scale.cc
index 591a6a938..80b030dc2 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1118,6 +1118,11 @@ void ScalePlaneBilinearDown(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -1313,6 +1318,11 @@ void ScalePlaneBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_C;
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 8d2509474..ddd8d29ed 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -348,6 +348,11 @@ static void ScaleARGBBilinearDown(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
@@ -467,6 +472,11 @@ static void ScaleARGBBilinearUp(int src_width,
       InterpolateRow = InterpolateRow_LSX;
     }
   }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
 #endif
   if (src_width >= 32768) {
     ScaleARGBFilterCols =
@@ -724,6 +734,11 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 
   void (*ScaleARGBFilterCols)(uint8_t* dst_argb, const uint8_t* src_argb,
                               int dst_width, int x, int dx) =
diff --git a/source/scale_common.cc b/source/scale_common.cc
index 5e603fd40..774559032 100644
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -1678,6 +1678,12 @@ void ScalePlaneVertical(int src_height,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
+
   for (j = 0; j < dst_height; ++j) {
     int yi;
     int yf;
diff --git a/source/scale_uv.cc b/source/scale_uv.cc
index 65f986e93..1556071d0 100644
--- a/source/scale_uv.cc
+++ b/source/scale_uv.cc
@@ -397,6 +397,11 @@ static void ScaleUVBilinearDown(int src_width,
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
+#endif
 #if defined(HAS_SCALEUVFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleUVFilterCols = ScaleUVFilterCols_SSSE3;
@@ -510,6 +515,11 @@ static void ScaleUVBilinearUp(int src_width,
       InterpolateRow = InterpolateRow_LSX;
     }
   }
+#endif
+#if defined(HAS_INTERPOLATEROW_RVV)
+  if (TestCpuFlag(kCpuHasRVV)) {
+    InterpolateRow = InterpolateRow_RVV;
+  }
 #endif
   if (src_width >= 32768) {
     ScaleUVFilterCols = filtering ? ScaleUVFilterCols64_C : ScaleUVCols64_C;