diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 55d4529df..c47c55d15 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -778,6 +778,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) #define HAS_AB64TOARGBROW_RVV #define HAS_AR64TOARGBROW_RVV +#define HAS_ARGBATTENUATEROW_RVV #define HAS_ARGBTOAB64ROW_RVV #define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTORAWROW_RVV @@ -787,9 +788,11 @@ extern "C" { #define HAS_ABGRTOYROW_RVV #define HAS_ABGRTOYJROW_RVV #define HAS_BGRATOYROW_RVV +#define HAS_I422ALPHATOARGBROW_RVV #define HAS_I422TOARGBROW_RVV #define HAS_I422TORGB24ROW_RVV #define HAS_I422TORGBAROW_RVV +#define HAS_I444ALPHATOARGBROW_RVV #define HAS_I444TOARGBROW_RVV #define HAS_I444TORGB24ROW_RVV #define HAS_MERGEARGBROW_RVV @@ -1081,6 +1084,13 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I444ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -1093,6 +1103,13 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -5503,6 +5520,9 @@ void ARGBAttenuateRow_MSA(const uint8_t* src_argb, void ARGBAttenuateRow_LASX(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 62884e5d4..4d953faeb 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -11,7 +11,6 @@ #include "libyuv/convert_argb.h" #include - #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #ifdef HAVE_JPEG @@ -2059,6 +2058,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2091,6 +2095,11 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2192,6 +2201,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I422ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2224,6 +2238,11 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2307,6 +2326,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -2339,6 +2363,11 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2567,6 +2596,11 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2668,6 +2702,11 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -2767,6 +2806,11 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -6703,6 +6747,11 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6735,6 +6784,11 @@ static int I420AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_BILINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -6895,6 +6949,11 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_I444ALPHATOARGBROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + I444AlphaToARGBRow = I444AlphaToARGBRow_RVV; + } +#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -6927,6 +6986,12 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif + #if defined(HAS_SCALEROWUP2_LINEAR_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleRowUp2_Linear = ScaleRowUp2_Linear_Any_SSE2; @@ -7060,6 +7125,11 @@ static int I010AlphaToARGBMatrixBilinear( } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -7221,6 +7291,11 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 32424fb91..fd12718da 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -3566,6 +3566,11 @@ int ARGBAttenuate(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_RVV) + if (TestCpuFlag(kCpuHasRVV)) { + ARGBAttenuateRow = ARGBAttenuateRow_RVV; + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 99f231651..ad131924a 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -16,7 +16,6 @@ */ #include - #include "libyuv/row.h" #if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) @@ -293,6 +292,38 @@ void I444ToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } +void I444AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + vuint8m1_t v_u, v_v; + vuint8m1_t v_ub, v_vr, v_ug, v_vg; + vuint8m1_t v_b, v_g, v_r, v_a; + vuint16m2_t v_yg, v_bb, v_bg, v_br; + vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, + v_br); + do { + READYUV444(vl, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m1(src_a, vl); + YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, + v_y_16, v_g_16, v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl; + src_v += vl; + dst_argb += vl * 4; + } while (w > 0); +} + void I444ToRGB24Row_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -352,6 +383,38 @@ void I422ToARGBRow_RVV(const uint8_t* src_y, } while (w > 0); } +void I422AlphaToARGBRow_RVV(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + size_t vl; + size_t w = (size_t)width; + vuint8m1_t v_u, v_v; + vuint8m1_t v_ub, v_vr, v_ug, v_vg; + vuint8m1_t v_b, v_g, v_r, v_a; + vuint16m2_t v_yg, v_bb, v_bg, v_br; + vuint16m2_t v_y_16, v_g_16, v_b_16, v_r_16; + YUVTORGB_SETUP(yuvconstants, vl, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, + v_br); + do { + READYUV422(vl, v_u, v_v, v_y_16); + v_a = __riscv_vle8_v_u8m1(src_a, vl); + YUVTORGB(vl, v_u, v_v, v_ub, v_vr, v_ug, v_vg, v_yg, v_bb, v_bg, v_br, + v_y_16, v_g_16, v_b_16, v_r_16); + RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r); + __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_y += vl; + src_a += vl; + src_u += vl / 2; + src_v += vl / 2; + dst_argb += vl * 4; + } while (w > 0); +} + void I422ToRGBARow_RVV(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -710,6 +773,31 @@ void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) { RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants); } +void ARGBAttenuateRow_RVV(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + size_t w = (size_t)width; + // To match behavior on other platforms, vxrm (fixed-point rounding mode + // register) is set to round-to-nearest-up(0). + asm volatile("csrwi vxrm, 0"); + do { + vuint8m2_t v_b, v_g, v_r, v_a; + vuint16m4_t v_ba_16, v_ga_16, v_ra_16; + size_t vl = __riscv_vsetvl_e8m2(w); + __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl); + v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl); + v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl); + v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl); + v_b = __riscv_vnclipu_wx_u8m2(v_ba_16, 8, vl); + v_g = __riscv_vnclipu_wx_u8m2(v_ga_16, 8, vl); + v_r = __riscv_vnclipu_wx_u8m2(v_ra_16, 8, vl); + __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl); + w -= vl; + src_argb += vl * 4; + dst_argb += vl * 4; + } while (w > 0); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv