[RVV] Enable ARGBExtractAlphaRow/ARGBCopyYToAlphaRow

* Run on SiFive internal FPGA:

TestARGBExtractAlpha(~3.2x vs scalar)
TestARGBCopyYToAlpha(~1.6x vs scalar)

Change-Id: I36525c67e8ac3f71ea9d1a58c7dc15a4009d9da1
Signed-off-by: Bruce Lai <bruce.lai@sifive.com>
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4617955
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Bruce Lai 2023-06-15 04:56:58 -07:00 committed by libyuv LUCI CQ
parent 552571e8b2
commit 04821d1e7d
5 changed files with 75 additions and 2 deletions

View File

@ -798,6 +798,8 @@ extern "C" {
#define HAS_AB64TOARGBROW_RVV #define HAS_AB64TOARGBROW_RVV
#define HAS_AR64TOARGBROW_RVV #define HAS_AR64TOARGBROW_RVV
#define HAS_ARGBATTENUATEROW_RVV #define HAS_ARGBATTENUATEROW_RVV
#define HAS_ARGBCOPYYTOALPHAROW_RVV
#define HAS_ARGBEXTRACTALPHAROW_RVV
#define HAS_ARGBTOAB64ROW_RVV #define HAS_ARGBTOAB64ROW_RVV
#define HAS_ARGBTOAR64ROW_RVV #define HAS_ARGBTOAR64ROW_RVV
#define HAS_ARGBTORAWROW_RVV #define HAS_ARGBTORAWROW_RVV
@ -3081,6 +3083,9 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb, void ARGBExtractAlphaRow_LSX(const uint8_t* src_argb,
uint8_t* dst_a, uint8_t* dst_a,
int width); int width);
void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
uint8_t* dst_a,
int width);
void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int width); int width);
@ -3100,6 +3105,7 @@ void ARGBExtractAlphaRow_Any_LSX(const uint8_t* src_ptr,
void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width);
void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int width); int width);

View File

@ -2128,6 +2128,11 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_LSX; : ARGBExtractAlphaRow_Any_LSX;
} }
#endif #endif
#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
}
#endif
for (y = 0; y < height - 1; y += 2) { for (y = 0; y < height - 1; y += 2) {
ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);

View File

@ -5340,6 +5340,11 @@ int ARGBExtractAlpha(const uint8_t* src_argb,
: ARGBExtractAlphaRow_Any_LSX; : ARGBExtractAlphaRow_Any_LSX;
} }
#endif #endif
#if defined(HAS_ARGBEXTRACTALPHAROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBExtractAlphaRow = ARGBExtractAlphaRow_RVV;
}
#endif
for (int y = 0; y < height; ++y) { for (int y = 0; y < height; ++y) {
ARGBExtractAlphaRow(src_argb, dst_a, width); ARGBExtractAlphaRow(src_argb, dst_a, width);
@ -5391,6 +5396,11 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
} }
} }
#endif #endif
#if defined(HAS_ARGBCOPYYTOALPHAROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_RVV;
}
#endif
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
ARGBCopyYToAlphaRow(src_y, dst_argb, width); ARGBCopyYToAlphaRow(src_y, dst_argb, width);

View File

@ -948,6 +948,35 @@ void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
} while (w > 0); } while (w > 0);
} }
void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
size_t w = (size_t)width;
do {
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2_t v_b, v_g, v_r, v_a;
__riscv_vlseg4e8_v_u8m2(&v_r, &v_g, &v_b, &v_a, src_argb, vl);
__riscv_vse8_v_u8m2(dst_a, v_a, vl);
w -= vl;
src_argb += vl * 4;
dst_a += vl;
} while (w > 0);
}
void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
size_t w = (size_t)width;
const ptrdiff_t dst_stride = 4;
dst += 3;
do {
size_t vl = __riscv_vsetvl_e8m8(w);
vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl);
__riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl);
w -= vl;
src += vl;
dst += vl * dst_stride;
} while (w > 0);
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@ -2749,12 +2749,23 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
MaskCpuFlags(disable_cpu_flags_); MaskCpuFlags(disable_cpu_flags_);
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c, ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
benchmark_width_, benchmark_width_, benchmark_height_); benchmark_width_, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_); double c_time = get_time();
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c,
benchmark_width_, benchmark_width_, benchmark_height_);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info_);
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
benchmark_width_, benchmark_width_, benchmark_height_);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt, ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
benchmark_width_, benchmark_width_, benchmark_height_); benchmark_width_, benchmark_width_, benchmark_height_);
} }
opt_time = (get_time() - opt_time) / benchmark_iterations_;
// Report performance of C vs OPT
printf("%8d us C - %8d us OPT\n",
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
for (int i = 0; i < kPixels; ++i) { for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
} }
@ -2777,12 +2788,24 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
MaskCpuFlags(disable_cpu_flags_); MaskCpuFlags(disable_cpu_flags_);
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c, ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_); benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_); double c_time = get_time();
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
c_time = (get_time() - c_time);
MaskCpuFlags(benchmark_cpu_info_);
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
double opt_time = get_time();
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt, ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_); benchmark_width_ * 4, benchmark_width_, benchmark_height_);
} }
opt_time = (get_time() - opt_time) / benchmark_iterations_;
// Report performance of C vs OPT
printf("%8d us C - %8d us OPT\n",
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
for (int i = 0; i < kPixels * 4; ++i) { for (int i = 0; i < kPixels * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
} }