mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-04-30 19:09:18 +08:00
Removed all SSE functions, macros, dispatching logic, and related unit tests across the repository to reduce code size and complexity. Left cpuid detection intact. Supported architectures like AVX2, NEON, SVE, etc. are unaffected. R=rrwinterton@gmail.com Bug: None Test: Build and run libyuv_unittest Change-Id: Id19608dba35b79c4c8fc31f920a6a968883d300f
1587 lines
57 KiB
C++
1587 lines
57 KiB
C++
/*
|
|
* Copyright 2023 The LibYuv Project Authors. All rights reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
/*
|
|
* Copyright (c) 2023 SiFive, Inc. All rights reserved.
|
|
*
|
|
* Contributed by Darren Hsieh <darren.hsieh@sifive.com>
|
|
* Contributed by Bruce Lai <bruce.lai@sifive.com>
|
|
*/
|
|
|
|
#include "libyuv/row.h"
|
|
|
|
// This module is for RVV (RISC-V Vector extension)
|
|
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
|
|
#include <assert.h>
|
|
#include <riscv_vector.h>
|
|
|
|
#ifdef __cplusplus
|
|
namespace libyuv {
|
|
extern "C" {
|
|
#endif
|
|
|
|
#ifdef LIBYUV_RVV_HAS_VXRM_ARG
|
|
// Fill YUV -> RGB conversion constants into scalar variables.
// ub/vr/ug/vg are the unsigned U/V mixing coefficients; yg is the Y gain.
// bb/bg/br are the per-channel biases; the +/-32 adjusts each bias by half
// of the 2^6 fixed-point denominator later removed in RGBTORGB8.
// (This variant relies on intrinsics that take an explicit vxrm argument,
// so no rounding-mode CSR write is needed here.)
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
  {                                                              \
    ub = yuvconst->kUVCoeff[0];                                  \
    vr = yuvconst->kUVCoeff[1];                                  \
    ug = yuvconst->kUVCoeff[2];                                  \
    vg = yuvconst->kUVCoeff[3];                                  \
    yg = yuvconst->kRGBCoeffBias[0];                             \
    bb = yuvconst->kRGBCoeffBias[1] + 32;                        \
    bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
    br = yuvconst->kRGBCoeffBias[3] + 32;                        \
  }
|
|
#else
|
|
// Fill YUV -> RGB conversion constants into scalar variables.
// ub/vr/ug/vg are the unsigned U/V mixing coefficients; yg is the Y gain.
// bb/bg/br are the per-channel biases; the +/-32 adjusts each bias by half
// of the 2^6 fixed-point denominator later removed in RGBTORGB8.
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
  {                                                              \
    asm volatile("csrwi vxrm, 0");                               \
    ub = yuvconst->kUVCoeff[0];                                  \
    vr = yuvconst->kUVCoeff[1];                                  \
    ug = yuvconst->kUVCoeff[2];                                  \
    vg = yuvconst->kUVCoeff[3];                                  \
    yg = yuvconst->kRGBCoeffBias[0];                             \
    bb = yuvconst->kRGBCoeffBias[1] + 32;                        \
    bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
    br = yuvconst->kRGBCoeffBias[3] + 32;                        \
  }
|
|
#endif
|
|
// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422.
// Loads ceil(w/2) U and V bytes, widens each to 16 bits, and multiplies by
// 0x0101 to replicate the byte into both halves of the 16-bit lane; the
// u16 -> u8 reinterpret then yields each chroma byte duplicated, so one
// U/V sample covers two Y samples (4:2:2 upsampling). Finally loads w Y
// bytes and widens them to 16 bits. On exit, vl holds the e8m2 count for w.
#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
  {                                                              \
    vuint8m1_t v_tmp0, v_tmp1;                                   \
    vuint8m2_t v_y;                                              \
    vuint16m2_t v_u_16, v_v_16;                                  \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                       \
    v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl);                     \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);             \
    v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl);                     \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);             \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);          \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);          \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);             \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);             \
    vl = __riscv_vsetvl_e8m2(w);                                 \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
  }
|
|
|
|
// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444.
// 4:4:4 has one chroma sample per pixel, so U/V are loaded directly at the
// same element count as Y; only Y is widened to 16 bits for the fixed-point
// math in YUVTORGB.
#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
  {                                                              \
    vuint8m2_t v_y;                                              \
    vl = __riscv_vsetvl_e8m2(w);                                 \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
    v_u = __riscv_vle8_v_u8m2(src_u, vl);                        \
    v_v = __riscv_vle8_v_u8m2(src_v, vl);                        \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
  }
|
|
|
|
// Convert from YUV to fixed point RGB (u16 lanes, 6 fractional bits;
// RGBTORGB8 removes the scale). The statement order is load-balanced and
// intentional; results:
//   y' = (257*Y * yg) >> 16                  (Y scaled 8->16 bits by 0x0101)
//   g  = sat(y' + bg - (U*ug + V*vg))        (saturating subtract clamps at 0)
//   b  = sat(y' + U*ub - bb)
//   r  = sat(y' + V*vr - br)
#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \
                 v_b_16, v_r_16)                                               \
  {                                                                            \
    vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4;                        \
    vuint32m8_t v_tmp5;                                                        \
    v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl);                             \
    v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl);                        \
    v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl);                    \
    v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl);                             \
    v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl);                          \
    v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl);                           \
    v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl);                            \
    v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl);                        \
    v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl);                    \
    v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl);                      \
    v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl);                          \
    v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl);                          \
  }
|
|
|
|
#ifdef LIBYUV_RVV_HAS_VXRM_ARG
|
|
// Convert from fixed point RGB To 8 bit RGB.
// Narrowing clip: shift right by 6 (removing the fixed-point scale) with
// round-to-nearest-up, saturating to the u8 range.
#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r)        \
  {                                                                 \
    v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, __RISCV_VXRM_RNU, vl); \
    v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, __RISCV_VXRM_RNU, vl); \
    v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, __RISCV_VXRM_RNU, vl); \
  }
|
|
#else
|
|
// Convert from fixed point RGB To 8 bit RGB.
// Narrowing clip: shift right by 6 (removing the fixed-point scale),
// saturating to the u8 range. Rounding mode comes from the vxrm CSR, which
// YUVTORGB_SETUP set to round-to-nearest-up.
#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
  {                                                          \
    v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl);            \
    v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl);            \
    v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl);            \
  }
|
|
#endif
|
|
|
|
// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv.
// The 2-field segmented load deinterleaves the UV plane (U first for NV12).
// As in READYUV422, the widen + 0x0101 multiply + reinterpret duplicates each
// chroma byte so one U/V sample covers two Y samples.
#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16)  \
  {                                                       \
    vuint8m1x2_t v_tmp;                                   \
    vuint8m1_t v_tmp0, v_tmp1;                            \
    vuint8m2_t v_y;                                       \
    vuint16m2_t v_u_16, v_v_16;                           \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                \
    v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_uv, vl);        \
    v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0);        \
    v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1);        \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);      \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);      \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);   \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);   \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);      \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);      \
    vl = __riscv_vsetvl_e8m2(w);                          \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                 \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);         \
  }
|
|
|
|
// Read 2*[VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu.
// Identical to READNV12 except the interleaved plane is VU (NV21), so
// segment 0 feeds V and segment 1 feeds U.
#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16)  \
  {                                                       \
    vuint8m1x2_t v_tmp;                                   \
    vuint8m1_t v_tmp0, v_tmp1;                            \
    vuint8m2_t v_y;                                       \
    vuint16m2_t v_u_16, v_v_16;                           \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                \
    v_tmp = __riscv_vlseg2e8_v_u8m1x2(src_vu, vl);        \
    v_tmp0 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 0);        \
    v_tmp1 = __riscv_vget_v_u8m1x2_u8m1(v_tmp, 1);        \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);      \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);      \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);   \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);   \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);      \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);      \
    vl = __riscv_vsetvl_e8m2(w);                          \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                 \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);         \
  }
|
|
|
|
#ifdef HAS_ARGBTOAR64ROW_RVV
|
|
// Convert a row of 8-bit ARGB to 16-bit AR64. Channel order is unchanged,
// so all four channels are processed as one flat byte stream.
void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
  size_t avl = (size_t)4 * width;  // total 8-bit channel count
  do {
    vuint16m8_t v_ar64;
    vuint8m4_t v_argb;
    size_t vl = __riscv_vsetvl_e8m4(avl);
    v_argb = __riscv_vle8_v_u8m4(src_argb, vl);
    // Widen to 16 bits, then x * 0x0101 replicates the byte into both halves
    // (8 -> 16 bit scaling, 255 -> 65535).
    v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl);
    v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl);
    __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl);
    avl -= vl;
    src_argb += vl;
    dst_ar64 += vl;
  } while (avl > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_ARGBTOAB64ROW_RVV
|
|
// Convert a row of 8-bit ARGB to 16-bit AB64 (B and R swapped). A segmented
// load/store is needed here (unlike ARGBToAR64Row) because the channel order
// changes.
void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
  size_t avl = (size_t)width;
  do {
    vuint16m2x4_t v_dst_ab64;
    vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
    size_t vl = __riscv_vsetvl_e8m1(avl);
    vuint8m1x4_t v_src_argb = __riscv_vlseg4e8_v_u8m1x4(src_argb, vl);
    vuint8m1_t v_b = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 0);
    vuint8m1_t v_g = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 1);
    vuint8m1_t v_r = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 2);
    vuint8m1_t v_a = __riscv_vget_v_u8m1x4_u8m1(v_src_argb, 3);
    // Widen each channel, then scale 8 -> 16 bits via x * 0x0101.
    v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl);
    v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl);
    v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl);
    v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl);
    v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl);
    v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl);
    v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl);
    v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl);
    // Store as R,G,B,A interleaved (AB64 little-endian layout).
    v_dst_ab64 = __riscv_vcreate_v_u16m2x4(v_r_16, v_g_16, v_b_16, v_a_16);
    __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_ab64, vl);
    avl -= vl;
    src_argb += 4 * vl;
    dst_ab64 += 4 * vl;
  } while (avl > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_AR64TOARGBROW_RVV
|
|
// Convert a row of 16-bit AR64 to 8-bit ARGB by keeping the high byte of
// each 16-bit channel. Channel order is unchanged, so the row is processed
// as one flat stream of 16-bit channels.
void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
  size_t avl = (size_t)4 * width;  // total 16-bit channel count
  do {
    vuint16m8_t v_ar64;
    vuint8m4_t v_argb;
    size_t vl = __riscv_vsetvl_e16m8(avl);
    v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl);
    v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl);  // narrowing shift: high byte
    __riscv_vse8_v_u8m4(dst_argb, v_argb, vl);
    avl -= vl;
    src_ar64 += vl;
    dst_argb += vl;
  } while (avl > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_AR64TOAB64ROW_RVV
|
|
// Convert a row of 16-bit AR64 to 16-bit AB64: pure channel shuffle swapping
// B and R via a segmented load, vcreate reorder, and segmented store.
void AR64ToAB64Row_RVV(const uint16_t* src_ar64,
                       uint16_t* dst_ab64,
                       int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e16m2(w);
    vuint16m2x4_t v_argb16 = __riscv_vlseg4e16_v_u16m2x4(src_ar64, vl);
    vuint16m2_t v_b = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 0);
    vuint16m2_t v_g = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 1);
    vuint16m2_t v_r = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 2);
    vuint16m2_t v_a = __riscv_vget_v_u16m2x4_u16m2(v_argb16, 3);
    vuint16m2x4_t v_dst_abgr = __riscv_vcreate_v_u16m2x4(v_r, v_g, v_b, v_a);
    __riscv_vsseg4e16_v_u16m2x4(dst_ab64, v_dst_abgr, vl);
    w -= vl;
    src_ar64 += vl * 4;
    dst_ab64 += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_AB64TOARGBROW_RVV
|
|
// Convert a row of 16-bit AB64 to 8-bit ARGB: take the high byte of each
// 16-bit channel and swap B and R back to ARGB order.
void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
  size_t avl = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e16m2(avl);
    vuint16m2x4_t v_abgr16 = __riscv_vlseg4e16_v_u16m2x4(src_ab64, vl);
    vuint16m2_t v_r_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 0);
    vuint16m2_t v_g_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 1);
    vuint16m2_t v_b_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 2);
    vuint16m2_t v_a_16 = __riscv_vget_v_u16m2x4_u16m2(v_abgr16, 3);
    // Narrowing shift by 8 keeps the high byte (16 -> 8 bit conversion).
    vuint8m1_t v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl);
    vuint8m1_t v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl);
    vuint8m1_t v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl);
    vuint8m1_t v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl);
    vuint8m1x4_t v_dst_argb = __riscv_vcreate_v_u8m1x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m1x4(dst_argb, v_dst_argb, vl);
    avl -= vl;
    src_ab64 += 4 * vl;
    dst_argb += 4 * vl;
  } while (avl > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_RAWTOARGBROW_RVV
|
|
// Convert a row of RAW (RGB byte order) to ARGB: swap R/B and append an
// opaque alpha channel. The alpha vector is filled once before the loop with
// the initial (maximum) vl; later iterations only shrink vl, so the filled
// lanes remain valid.
void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2);
    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_RAWTORGBAROW_RVV
|
|
// Convert a row of RAW (RGB byte order) to RGBA: swap R/B and prepend an
// opaque alpha channel. Alpha is filled once with the initial (maximum) vl;
// later iterations only shrink vl.
void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2);
    // RGBA memory layout: A first, then B, G, R.
    vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r);
    __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_rgba += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_RAWTORGB24ROW_RVV
|
|
// Convert a row of RAW (RGB byte order) to RGB24 (BGR byte order) by swapping
// the first and third channel of each 3-byte pixel.
void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2x3_t v_bgr = __riscv_vlseg3e8_v_u8m2x3(src_raw, vl);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 1);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_bgr, 2);
    vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_ARGBTORAWROW_RVV
|
|
void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
|
|
size_t w = (size_t)width;
|
|
do {
|
|
size_t vl = __riscv_vsetvl_e8m2(w);
|
|
vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
|
|
vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
|
|
vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
|
|
vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
|
|
vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b);
|
|
__riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl);
|
|
w -= vl;
|
|
src_argb += vl * 4;
|
|
dst_raw += vl * 3;
|
|
} while (w > 0);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_ARGBTORGB24ROW_RVV
|
|
// Convert a row of ARGB to RGB24: drop the alpha channel, keeping the
// B,G,R byte order.
void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
                        uint8_t* dst_rgb24,
                        int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
    vuint8m2x3_t v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_ARGBTOABGRROW_RVV
|
|
// Convert a row of ARGB to ABGR: swap the B and R channels, keeping G and A.
void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
    vuint8m2x4_t v_dst_abgr = __riscv_vcreate_v_u8m2x4(v_r, v_g, v_b, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_abgr, v_dst_abgr, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_abgr += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_ARGBTOBGRAROW_RVV
|
|
// Convert a row of ARGB to BGRA: rotate the channel order so alpha comes
// first in memory, followed by R, G, B.
void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
    vuint8m2x4_t v_dst_bgra = __riscv_vcreate_v_u8m2x4(v_a, v_r, v_g, v_b);
    __riscv_vsseg4e8_v_u8m2x4(dst_bgra, v_dst_bgra, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_bgra += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_ARGBTORGBAROW_RVV
|
|
// Convert a row of ARGB to RGBA: move alpha from the last to the first byte
// of each pixel, keeping B, G, R order.
void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2x4_t v_src_argb = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
    vuint8m2x4_t v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r);
    __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_rgba += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_RGBATOARGBROW_RVV
|
|
// Convert a row of RGBA to ARGB: move alpha from the first to the last byte
// of each pixel (inverse of ARGBToRGBARow).
void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl);
    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3);
    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_rgba += vl * 4;
    dst_argb += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_RGB24TOARGBROW_RVV
|
|
// Convert a row of RGB24 (B,G,R) to ARGB by appending an opaque alpha
// channel. Alpha is filled once with the initial (maximum) vl; later
// iterations only shrink vl, so the filled lanes remain valid.
void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
                        uint8_t* dst_argb,
                        int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2x3_t v_src_rgb = __riscv_vlseg3e8_v_u8m2x3(src_rgb24, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_src_rgb, 2);
    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_rgb24 += vl * 3;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I444TOARGBROW_RVV
|
|
// Convert a row of planar I444 (4:4:4 YUV) to ARGB with opaque alpha.
// Pipeline: READYUV444 loads Y/U/V, YUVTORGB produces fixed-point RGB,
// RGBTORGB8 narrows to 8 bits, then a segmented store interleaves B,G,R,A.
void I444ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  vuint8m2x4_t v_dst_argb;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  // Alpha filled once with the initial (maximum) vl; later vl only shrinks.
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_y += vl;
    src_u += vl;
    src_v += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I444ALPHATOARGBROW_RVV
|
|
// Convert a row of planar I444 plus a separate alpha plane to ARGB.
// Same pipeline as I444ToARGBRow, but alpha is loaded per iteration from
// src_a instead of being constant 255.
void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  size_t vl;  // set by READYUV444 each iteration
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    vuint8m2x4_t v_dst_argb;
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    v_a = __riscv_vle8_v_u8m2(src_a, vl);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_y += vl;
    src_a += vl;
    src_u += vl;
    src_v += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I444TORGB24ROW_RVV
|
|
// Convert a row of planar I444 (4:4:4 YUV) to RGB24 (B,G,R, no alpha).
void I444ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t vl;  // set by READYUV444 each iteration
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    vuint8m2x3_t v_dst_rgb;
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
    w -= vl;
    src_y += vl;
    src_u += vl;
    src_v += vl;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I422TOARGBROW_RVV
|
|
// Convert a row of planar I422 (4:2:2 YUV) to ARGB with opaque alpha.
// READYUV422 duplicates each chroma sample to cover two Y samples, so the
// chroma pointers advance by vl/2 per iteration.
void I422ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  vuint8m2x4_t v_dst_argb;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  // Alpha filled once with the initial (maximum) vl; later vl only shrinks.
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I422ALPHATOARGBROW_RVV
|
|
// Convert a row of planar I422 plus a separate alpha plane to ARGB.
// Same pipeline as I422ToARGBRow, but alpha is loaded per iteration from
// src_a instead of being constant 255.
void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  size_t vl;  // set by READYUV422 each iteration
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    vuint8m2x4_t v_dst_argb;
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    v_a = __riscv_vle8_v_u8m2(src_a, vl);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_y += vl;
    src_a += vl;
    // Chroma is subsampled 2:1 horizontally.
    src_u += vl / 2;
    src_v += vl / 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I422TORGBAROW_RVV
|
|
// Convert a row of planar I422 (4:2:2 YUV) to RGBA (alpha stored first)
// with opaque alpha.
void I422ToRGBARow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_rgba,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  vuint8m2x4_t v_dst_rgba;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  // Alpha filled once with the initial (maximum) vl; later vl only shrinks.
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_dst_rgba = __riscv_vcreate_v_u8m2x4(v_a, v_b, v_g, v_r);
    __riscv_vsseg4e8_v_u8m2x4(dst_rgba, v_dst_rgba, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_rgba += vl * 4;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I422TORGB24ROW_RVV
|
|
// Convert a row of planar I422 (4:2:2 YUV) to RGB24 (B,G,R, no alpha).
void I422ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t vl;  // set by READYUV422 each iteration
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  vuint8m2x3_t v_dst_rgb;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_dst_rgb = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_dst_rgb, vl);
    w -= vl;
    src_y += vl;
    // Chroma is subsampled 2:1 horizontally.
    src_u += vl / 2;
    src_v += vl / 2;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_I400TOARGBROW_RVV
|
|
#if defined(LIBYUV_RVV_HAS_VXRM_ARG)
|
|
// Convert a row of I400 (grayscale Y) to ARGB using the Y gain/bias from
// yuvconstants: out = clip((257*Y * yg) >> 16 +/- yb, 6-bit fixed point),
// replicated into B, G, R with opaque alpha.
// kRGBCoeffBias[4] may be negative, so its sign is folded into a choice
// between saturating add and saturating subtract of |yb|.
void I400ToARGBRow_RVV(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0);
  // Alpha and coefficient splats filled once with the initial (maximum) vl;
  // the loop's vl only shrinks, so these lanes remain valid.
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
  vuint8m2x4_t v_dst_argb;
  vuint16m4_t v_yb;
  if (is_yb_positive) {
    // The -32/+32 adjusts the bias by half of the 2^6 fixed-point
    // denominator removed by the final narrowing clip.
    v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
  } else {
    v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl);
  }
  do {
    vuint8m2_t v_y, v_out;
    vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2;
    vl = __riscv_vsetvl_e8m2(w);
    v_y = __riscv_vle8_v_u8m2(src_y, vl);
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);
    v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl);  // 257 * v_y
    v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl);  // high half = >> 16
    if (is_yb_positive) {
      v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl);
    } else {
      v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl);
    }
    v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, __RISCV_VXRM_RNU, vl);
    v_dst_argb = __riscv_vcreate_v_u8m2x4(v_out, v_out, v_out, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_y += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
|
|
#else
|
|
void I400ToARGBRow_RVV(const uint8_t* src_y,
|
|
uint8_t* dst_argb,
|
|
const struct YuvConstants* yuvconstants,
|
|
int width) {
|
|
size_t w = (size_t)width;
|
|
size_t vl = __riscv_vsetvl_e8m2(w);
|
|
const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0);
|
|
vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
|
|
vuint16m4_t v_yb;
|
|
vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
|
|
// To match behavior on other platforms, vxrm (fixed-point rounding mode
|
|
// register) sets to round-to-nearest-up mode(0).
|
|
asm volatile("csrwi vxrm, 0");
|
|
if (is_yb_positive) {
|
|
v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
|
|
} else {
|
|
v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl);
|
|
}
|
|
do {
|
|
vuint8m2_t v_y, v_out;
|
|
vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2;
|
|
vl = __riscv_vsetvl_e8m2(w);
|
|
v_y = __riscv_vle8_v_u8m2(src_y, vl);
|
|
v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);
|
|
v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl); // 257 * v_y
|
|
v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl);
|
|
if (is_yb_positive) {
|
|
v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl);
|
|
} else {
|
|
v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl);
|
|
}
|
|
v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl);
|
|
__riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl);
|
|
w -= vl;
|
|
src_y += vl;
|
|
dst_argb += vl * 4;
|
|
} while (w > 0);
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef HAS_J400TOARGBROW_RVV
|
|
// Convert a row of J400 (full-range grayscale) to ARGB: Y is copied into
// B, G and R unchanged, with opaque alpha. Alpha is filled once with the
// initial (maximum) vl; later iterations only shrink vl.
void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_y = __riscv_vle8_v_u8m2(src_y, vl);
    vuint8m2x4_t v_dst_argb = __riscv_vcreate_v_u8m2x4(v_y, v_y, v_y, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
    w -= vl;
    src_y += vl;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
|
|
#endif
|
|
|
|
#ifdef HAS_COPYROW_RVV
|
|
// Copy a row of bytes using the widest (m8) register grouping.
void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
  size_t remaining = (size_t)width;
  while (remaining > 0) {
    size_t n = __riscv_vsetvl_e8m8(remaining);
    __riscv_vse8_v_u8m8(dst, __riscv_vle8_v_u8m8(src, n), n);
    src += n;
    dst += n;
    remaining -= n;
  }
}
|
|
#endif
|
|
|
|
#ifdef HAS_NV12TOARGBROW_RVV
|
|
void NV12ToARGBRow_RVV(const uint8_t* src_y,
|
|
const uint8_t* src_uv,
|
|
uint8_t* dst_argb,
|
|
const struct YuvConstants* yuvconstants,
|
|
int width) {
|
|
size_t w = (size_t)width;
|
|
size_t vl = __riscv_vsetvl_e8m2(w);
|
|
uint8_t ub, vr, ug, vg;
|
|
int16_t yg, bb, bg, br;
|
|
vuint8m2_t v_u, v_v;
|
|
vuint8m2_t v_b, v_g, v_r, v_a;
|
|
vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
|
|
vuint8m2x4_t v_dst_argb;
|
|
YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
|
|
v_a = __riscv_vmv_v_x_u8m2(255u, vl);
|
|
do {
|
|
READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
|
|
YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
|
|
v_b_16, v_r_16);
|
|
RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
|
|
v_dst_argb = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
|
|
__riscv_vsseg4e8_v_u8m2x4(dst_argb, v_dst_argb, vl);
|
|
w -= vl;
|
|
src_y += vl;
|
|
src_uv += vl;
|
|
dst_argb += vl * 4;
|
|
} while (w > 0);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_NV12TORGB24ROW_RVV
// Convert one row of NV12 (Y plane + interleaved UV plane) to packed RGB24.
void NV12ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t remaining = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(remaining);
  uint8_t cf_ub, cf_vr, cf_ug, cf_vg;
  int16_t cf_yg, cf_bb, cf_bg, cf_br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint8m2x3_t v_pixels;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, cf_ub, cf_vr, cf_ug, cf_vg, cf_yg, cf_bb, cf_bg,
                 cf_br);
  do {
    READNV12(vl, remaining, src_y, src_uv, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, cf_ub, cf_vr, cf_ug, cf_vg, cf_yg, cf_bb, cf_bg,
             cf_br, v_y_16, v_g_16, v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_pixels = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_pixels, vl);
    remaining -= vl;
    src_y += vl;
    src_uv += vl;
    dst_rgb24 += vl * 3;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_NV21TOARGBROW_RVV
// Convert one row of NV21 (Y plane + interleaved VU plane) to ARGB.
// Alpha is forced to 255 (fully opaque).
void NV21ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_vu,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t remaining = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(remaining);
  uint8_t cf_ub, cf_vr, cf_ug, cf_vg;
  int16_t cf_yg, cf_bb, cf_bg, cf_br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint8m2x4_t v_pixels;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, cf_ub, cf_vr, cf_ug, cf_vg, cf_yg, cf_bb, cf_bg,
                 cf_br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);  // constant opaque alpha lane
  do {
    READNV21(vl, remaining, src_y, src_vu, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, cf_ub, cf_vr, cf_ug, cf_vg, cf_yg, cf_bb, cf_bg,
             cf_br, v_y_16, v_g_16, v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_pixels = __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a);
    __riscv_vsseg4e8_v_u8m2x4(dst_argb, v_pixels, vl);
    remaining -= vl;
    src_y += vl;
    src_vu += vl;
    dst_argb += vl * 4;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_NV21TORGB24ROW_RVV
// Convert one row of NV21 (Y plane + interleaved VU plane) to packed RGB24.
void NV21ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t remaining = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(remaining);
  uint8_t cf_ub, cf_vr, cf_ug, cf_vg;
  int16_t cf_yg, cf_bb, cf_bg, cf_br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint8m2x3_t v_pixels;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, cf_ub, cf_vr, cf_ug, cf_vg, cf_yg, cf_bb, cf_bg,
                 cf_br);
  do {
    READNV21(vl, remaining, src_y, src_vu, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, cf_ub, cf_vr, cf_ug, cf_vg, cf_yg, cf_bb, cf_bg,
             cf_br, v_y_16, v_g_16, v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    v_pixels = __riscv_vcreate_v_u8m2x3(v_b, v_g, v_r);
    __riscv_vsseg3e8_v_u8m2x3(dst_rgb24, v_pixels, vl);
    remaining -= vl;
    src_y += vl;
    src_vu += vl;
    dst_rgb24 += vl * 3;
  } while (remaining > 0);
}
#endif
|
|
|
|
// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1
#ifdef HAS_INTERPOLATEROW_RVV
#ifdef LIBYUV_RVV_HAS_VXRM_ARG
// Vertically blend two rows: dst = (src * (256 - f) + src1 * f) >> 8,
// rounded to nearest.  Special cases: f == 0 copies the first row and
// f == 128 uses a rounding averaging add.
void InterpolateRow_RVV(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int dst_width,
                        int source_y_fraction) {
  int frac1 = source_y_fraction;
  int frac0 = 256 - frac1;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  size_t remaining = (size_t)dst_width;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);
  if (frac1 == 0) {
    // Blend 100 / 0 - Copy row unchanged.
    do {
      size_t vl = __riscv_vsetvl_e8m8(remaining);
      __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
      remaining -= vl;
      src_ptr += vl;
      dst_ptr += vl;
    } while (remaining > 0);
    return;
  }
  if (frac1 == 128) {
    // Blend 50 / 50 via a round-to-nearest-up averaging add.
    do {
      size_t vl = __riscv_vsetvl_e8m8(remaining);
      vuint8m8_t top = __riscv_vle8_v_u8m8(src_ptr, vl);
      vuint8m8_t bot = __riscv_vle8_v_u8m8(src_ptr1, vl);
      __riscv_vse8_v_u8m8(
          dst_ptr, __riscv_vaaddu_vv_u8m8(top, bot, __RISCV_VXRM_RNU, vl), vl);
      remaining -= vl;
      src_ptr += vl;
      src_ptr1 += vl;
      dst_ptr += vl;
    } while (remaining > 0);
    return;
  }
  // General purpose row blend with 8.8 fixed-point weights.
  do {
    size_t vl = __riscv_vsetvl_e8m4(remaining);
    vuint8m4_t top = __riscv_vle8_v_u8m4(src_ptr, vl);
    vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(top, frac0, vl);
    vuint8m4_t bot = __riscv_vle8_v_u8m4(src_ptr1, vl);
    acc = __riscv_vwmaccu_vx_u16m8(acc, frac1, bot, vl);
    __riscv_vse8_v_u8m4(
        dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, __RISCV_VXRM_RNU, vl), vl);
    remaining -= vl;
    src_ptr += vl;
    src_ptr1 += vl;
    dst_ptr += vl;
  } while (remaining > 0);
}
|
|
#else
|
|
void InterpolateRow_RVV(uint8_t* dst_ptr,
|
|
const uint8_t* src_ptr,
|
|
ptrdiff_t src_stride,
|
|
int dst_width,
|
|
int source_y_fraction) {
|
|
int y1_fraction = source_y_fraction;
|
|
int y0_fraction = 256 - y1_fraction;
|
|
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
|
size_t dst_w = (size_t)dst_width;
|
|
assert(source_y_fraction >= 0);
|
|
assert(source_y_fraction < 256);
|
|
// Blend 100 / 0 - Copy row unchanged.
|
|
if (y1_fraction == 0) {
|
|
do {
|
|
size_t vl = __riscv_vsetvl_e8m8(dst_w);
|
|
__riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
|
|
dst_w -= vl;
|
|
src_ptr += vl;
|
|
dst_ptr += vl;
|
|
} while (dst_w > 0);
|
|
return;
|
|
}
|
|
// To match behavior on other platforms, vxrm (fixed-point rounding mode
|
|
// register) is set to round-to-nearest-up(0).
|
|
asm volatile("csrwi vxrm, 0");
|
|
// Blend 50 / 50.
|
|
if (y1_fraction == 128) {
|
|
do {
|
|
size_t vl = __riscv_vsetvl_e8m8(dst_w);
|
|
vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
|
|
vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
|
|
// Use round-to-nearest-up mode for averaging add
|
|
vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
|
|
__riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
|
|
dst_w -= vl;
|
|
src_ptr += vl;
|
|
src_ptr1 += vl;
|
|
dst_ptr += vl;
|
|
} while (dst_w > 0);
|
|
return;
|
|
}
|
|
// General purpose row blend.
|
|
do {
|
|
size_t vl = __riscv_vsetvl_e8m4(dst_w);
|
|
vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
|
|
vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
|
|
vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
|
|
acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
|
|
// Use round-to-nearest-up mode for vnclip
|
|
__riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
|
|
dst_w -= vl;
|
|
src_ptr += vl;
|
|
src_ptr1 += vl;
|
|
dst_ptr += vl;
|
|
} while (dst_w > 0);
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
#ifdef HAS_SPLITRGBROW_RVV
// De-interleave packed 3-byte pixels into three separate planes.
void SplitRGBRow_RVV(const uint8_t* src_rgb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
                     uint8_t* dst_b,
                     int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x3_t v_in = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl);
    __riscv_vse8_v_u8m2(dst_r, __riscv_vget_v_u8m2x3_u8m2(v_in, 0), vl);
    __riscv_vse8_v_u8m2(dst_g, __riscv_vget_v_u8m2x3_u8m2(v_in, 1), vl);
    __riscv_vse8_v_u8m2(dst_b, __riscv_vget_v_u8m2x3_u8m2(v_in, 2), vl);
    remaining -= vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_rgb += vl * 3;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_MERGERGBROW_RVV
// Interleave three separate planes into packed 3-byte pixels.
void MergeRGBRow_RVV(const uint8_t* src_r,
                     const uint8_t* src_g,
                     const uint8_t* src_b,
                     uint8_t* dst_rgb,
                     int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2_t v_plane_r = __riscv_vle8_v_u8m2(src_r, vl);
    vuint8m2_t v_plane_g = __riscv_vle8_v_u8m2(src_g, vl);
    vuint8m2_t v_plane_b = __riscv_vle8_v_u8m2(src_b, vl);
    __riscv_vsseg3e8_v_u8m2x3(
        dst_rgb, __riscv_vcreate_v_u8m2x3(v_plane_r, v_plane_g, v_plane_b),
        vl);
    remaining -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    dst_rgb += vl * 3;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_SPLITARGBROW_RVV
// De-interleave B,G,R,A packed pixels into four separate planes.
void SplitARGBRow_RVV(const uint8_t* src_argb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      uint8_t* dst_a,
                      int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x4_t v_in = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    __riscv_vse8_v_u8m2(dst_a, __riscv_vget_v_u8m2x4_u8m2(v_in, 3), vl);
    __riscv_vse8_v_u8m2(dst_r, __riscv_vget_v_u8m2x4_u8m2(v_in, 2), vl);
    __riscv_vse8_v_u8m2(dst_g, __riscv_vget_v_u8m2x4_u8m2(v_in, 1), vl);
    __riscv_vse8_v_u8m2(dst_b, __riscv_vget_v_u8m2x4_u8m2(v_in, 0), vl);
    remaining -= vl;
    dst_a += vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_argb += vl * 4;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_MERGEARGBROW_RVV
// Interleave four separate planes into packed B,G,R,A pixels.
void MergeARGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      const uint8_t* src_a,
                      uint8_t* dst_argb,
                      int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2_t v_plane_r = __riscv_vle8_v_u8m2(src_r, vl);
    vuint8m2_t v_plane_g = __riscv_vle8_v_u8m2(src_g, vl);
    vuint8m2_t v_plane_b = __riscv_vle8_v_u8m2(src_b, vl);
    vuint8m2_t v_plane_a = __riscv_vle8_v_u8m2(src_a, vl);
    __riscv_vsseg4e8_v_u8m2x4(
        dst_argb,
        __riscv_vcreate_v_u8m2x4(v_plane_b, v_plane_g, v_plane_r, v_plane_a),
        vl);
    remaining -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    src_a += vl;
    dst_argb += vl * 4;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_SPLITXRGBROW_RVV
// De-interleave B,G,R,X packed pixels into three planes; X byte is dropped.
void SplitXRGBRow_RVV(const uint8_t* src_argb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x4_t v_in = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    __riscv_vse8_v_u8m2(dst_r, __riscv_vget_v_u8m2x4_u8m2(v_in, 2), vl);
    __riscv_vse8_v_u8m2(dst_g, __riscv_vget_v_u8m2x4_u8m2(v_in, 1), vl);
    __riscv_vse8_v_u8m2(dst_b, __riscv_vget_v_u8m2x4_u8m2(v_in, 0), vl);
    remaining -= vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_argb += vl * 4;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_MERGEXRGBROW_RVV
// Interleave three planes into packed B,G,R,A pixels with alpha = 255.
void MergeXRGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_argb,
                      int width) {
  size_t remaining = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(remaining);
  // Splat the opaque alpha once; vl only shrinks across iterations.
  vuint8m2_t v_opaque = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_plane_r = __riscv_vle8_v_u8m2(src_r, vl);
    vuint8m2_t v_plane_g = __riscv_vle8_v_u8m2(src_g, vl);
    vuint8m2_t v_plane_b = __riscv_vle8_v_u8m2(src_b, vl);
    __riscv_vsseg4e8_v_u8m2x4(
        dst_argb,
        __riscv_vcreate_v_u8m2x4(v_plane_b, v_plane_g, v_plane_r, v_opaque),
        vl);
    remaining -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(remaining);
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_SPLITUVROW_RVV
// De-interleave a UV plane into separate U and V planes.
void SplitUVRow_RVV(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m4(remaining);
    vuint8m4x2_t v_in = __riscv_vlseg2e8_v_u8m4x2(src_uv, vl);
    __riscv_vse8_v_u8m4(dst_u, __riscv_vget_v_u8m4x2_u8m4(v_in, 0), vl);
    __riscv_vse8_v_u8m4(dst_v, __riscv_vget_v_u8m4x2_u8m4(v_in, 1), vl);
    remaining -= vl;
    dst_u += vl;
    dst_v += vl;
    src_uv += 2 * vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_MERGEUVROW_RVV
// Interleave separate U and V planes into a single UV plane.
void MergeUVRow_RVV(const uint8_t* src_u,
                    const uint8_t* src_v,
                    uint8_t* dst_uv,
                    int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m4(remaining);
    vuint8m4_t v_plane_u = __riscv_vle8_v_u8m4(src_u, vl);
    vuint8m4_t v_plane_v = __riscv_vle8_v_u8m4(src_v, vl);
    __riscv_vsseg2e8_v_u8m4x2(
        dst_uv, __riscv_vcreate_v_u8m4x2(v_plane_u, v_plane_v), vl);
    remaining -= vl;
    src_u += vl;
    src_v += vl;
    dst_uv += 2 * vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
|
|
|
|
// RGB to JPeg coefficients
|
|
// B * 0.1140 coefficient = 29
|
|
// G * 0.5870 coefficient = 150
|
|
// R * 0.2990 coefficient = 77
|
|
// Add 0.5 = 0x80
|
|
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {0}, {0}, {128}, {0}};
|
|
|
|
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0}, {128}, {0}};
|
|
|
|
// RGB to BT.601 coefficients
|
|
// B * 0.1016 coefficient = 25
|
|
// G * 0.5078 coefficient = 129
|
|
// R * 0.2578 coefficient = 66
|
|
// Add 16.5 = 0x1080
|
|
|
|
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {0}, {0}, {0x1080}, {0}};
|
|
|
|
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {0}, {0}, {0x1080}, {0}};
|
|
|
|
// ARGB expects first 3 values to contain RGB and 4th value is ignored
#ifdef HAS_ARGBTOYMATRIXROW_RVV
// Shared luma kernel: y = (r*ry + g*gy + b*by + kAddY) >> 8.  Coefficients
// come from |c| so the BT.601 and JPEG entry points share one loop.
void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
                          uint8_t* dst_y,
                          int width,
                          const struct ArgbConstants* c) {
  assert(width != 0);
  size_t remaining = (size_t)width;
  vuint8m2_t v_coeff_b, v_coeff_g, v_coeff_r;  // splatted RGBToY constants
  vuint16m4_t v_bias;                          // splatted kAddY bias
  size_t setup_vl = __riscv_vsetvl_e8m2(remaining);
  v_coeff_b = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], setup_vl);
  v_coeff_g = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], setup_vl);
  v_coeff_r = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], setup_vl);
  v_bias = __riscv_vmv_v_x_u16m4(c->kAddY[0], setup_vl);
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x4_t v_px = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_px, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_px, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_px, 2);
    vuint16m4_t v_sum = __riscv_vwmulu_vv_u16m4(v_r, v_coeff_r, vl);
    v_sum = __riscv_vwmaccu_vv_u16m4(v_sum, v_coeff_g, v_g, vl);
    v_sum = __riscv_vwmaccu_vv_u16m4(v_sum, v_coeff_b, v_b, vl);
    v_sum = __riscv_vadd_vv_u16m4(v_sum, v_bias, vl);
    __riscv_vse8_v_u8m2(dst_y, __riscv_vnsrl_wx_u8m2(v_sum, 8, vl), vl);
    remaining -= vl;
    src_argb += 4 * vl;
    dst_y += vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_ARGBTOYROW_RVV
// ARGB -> BT.601 (studio range) luma.
void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants);
}
#endif
|
|
|
|
#ifdef HAS_ARGBTOYJROW_RVV
// ARGB -> JPEG (full range) luma.
void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}
#endif
|
|
|
|
#ifdef HAS_ABGRTOYROW_RVV
// ABGR -> BT.601 (studio range) luma; swapped-channel coefficient table.
void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants);
}
#endif
|
|
|
|
#ifdef HAS_ABGRTOYJROW_RVV
// ABGR -> JPEG (full range) luma; swapped-channel coefficient table.
void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants);
}
#endif
|
|
|
|
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
#ifdef HAS_RGBATOYMATRIXROW_RVV
// Shared luma kernel for A-first layouts; same math as ARGBToYMatrixRow_RVV
// but the color channels live at segment indices 1..3.
static void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct ArgbConstants* c) {
  assert(width != 0);
  size_t remaining = (size_t)width;
  vuint8m2_t v_coeff_b, v_coeff_g, v_coeff_r;  // splatted RGBToY constants
  vuint16m4_t v_bias;                          // splatted kAddY bias
  size_t setup_vl = __riscv_vsetvl_e8m2(remaining);
  v_coeff_b = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], setup_vl);
  v_coeff_g = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], setup_vl);
  v_coeff_r = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], setup_vl);
  v_bias = __riscv_vmv_v_x_u16m4(c->kAddY[0], setup_vl);
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x4_t v_px = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_px, 1);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_px, 2);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_px, 3);
    vuint16m4_t v_sum = __riscv_vwmulu_vv_u16m4(v_r, v_coeff_r, vl);
    v_sum = __riscv_vwmaccu_vv_u16m4(v_sum, v_coeff_g, v_g, vl);
    v_sum = __riscv_vwmaccu_vv_u16m4(v_sum, v_coeff_b, v_b, vl);
    v_sum = __riscv_vadd_vv_u16m4(v_sum, v_bias, vl);
    __riscv_vse8_v_u8m2(dst_y, __riscv_vnsrl_wx_u8m2(v_sum, 8, vl), vl);
    remaining -= vl;
    src_rgba += 4 * vl;
    dst_y += vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_RGBATOYROW_RVV
// RGBA -> BT.601 (studio range) luma.
void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants);
}
#endif
|
|
|
|
#ifdef HAS_RGBATOYJROW_RVV
// RGBA -> JPEG (full range) luma.
void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
  RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
}
#endif
|
|
|
|
#ifdef HAS_BGRATOYROW_RVV
// BGRA -> BT.601 (studio range) luma; swapped-channel coefficient table.
void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants);
}
#endif
|
|
|
|
#ifdef HAS_RGBTOYMATRIXROW_RVV
// Shared luma kernel for packed 3-byte pixels (no alpha); same math as
// ARGBToYMatrixRow_RVV but uses a 3-segment load.
static void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
                                uint8_t* dst_y,
                                int width,
                                const struct ArgbConstants* c) {
  assert(width != 0);
  size_t remaining = (size_t)width;
  vuint8m2_t v_coeff_b, v_coeff_g, v_coeff_r;  // splatted RGBToY constants
  vuint16m4_t v_bias;                          // splatted kAddY bias
  size_t setup_vl = __riscv_vsetvl_e8m2(remaining);
  v_coeff_b = __riscv_vmv_v_x_u8m2(c->kRGBToY[0], setup_vl);
  v_coeff_g = __riscv_vmv_v_x_u8m2(c->kRGBToY[1], setup_vl);
  v_coeff_r = __riscv_vmv_v_x_u8m2(c->kRGBToY[2], setup_vl);
  v_bias = __riscv_vmv_v_x_u16m4(c->kAddY[0], setup_vl);
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x3_t v_px = __riscv_vlseg3e8_v_u8m2x3(src_rgb, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x3_u8m2(v_px, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x3_u8m2(v_px, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x3_u8m2(v_px, 2);
    vuint16m4_t v_sum = __riscv_vwmulu_vv_u16m4(v_r, v_coeff_r, vl);
    v_sum = __riscv_vwmaccu_vv_u16m4(v_sum, v_coeff_g, v_g, vl);
    v_sum = __riscv_vwmaccu_vv_u16m4(v_sum, v_coeff_b, v_b, vl);
    v_sum = __riscv_vadd_vv_u16m4(v_sum, v_bias, vl);
    __riscv_vse8_v_u8m2(dst_y, __riscv_vnsrl_wx_u8m2(v_sum, 8, vl), vl);
    remaining -= vl;
    src_rgb += 3 * vl;
    dst_y += vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// src_argb: RGB values have already been pre-multiplied by the a.
#ifdef HAS_ARGBBLENDROW_RVV
void ARGBBlendRow_RVV(const uint8_t* src_argb,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width) {
  size_t remaining = (size_t)width;
  // Output alpha is always opaque; splat once at VLMAX.
  vuint8m2_t v_opaque =
      __riscv_vmv_v_x_u8m2(255, __riscv_vsetvlmax_e8m2());
  // clamp255((((256 - a) * b) >> 8) + f)
  // = b * (256 - a) / 256 + f
  // = b - (b * a / 256) + f
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x4_t v_fg = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    vuint8m2_t v_fg_b = __riscv_vget_v_u8m2x4_u8m2(v_fg, 0);
    vuint8m2_t v_fg_g = __riscv_vget_v_u8m2x4_u8m2(v_fg, 1);
    vuint8m2_t v_fg_r = __riscv_vget_v_u8m2x4_u8m2(v_fg, 2);
    vuint8m2_t v_fg_a = __riscv_vget_v_u8m2x4_u8m2(v_fg, 3);
    vuint8m2x4_t v_bg = __riscv_vlseg4e8_v_u8m2x4(src_argb1, vl);
    vuint8m2_t v_bg_b = __riscv_vget_v_u8m2x4_u8m2(v_bg, 0);
    vuint8m2_t v_bg_g = __riscv_vget_v_u8m2x4_u8m2(v_bg, 1);
    vuint8m2_t v_bg_r = __riscv_vget_v_u8m2x4_u8m2(v_bg, 2);

    // b * a / 256 via the high half of the 8x8 widening multiply.
    vuint8m2_t v_scaled_b = __riscv_vmulhu_vv_u8m2(v_bg_b, v_fg_a, vl);
    vuint8m2_t v_scaled_g = __riscv_vmulhu_vv_u8m2(v_bg_g, v_fg_a, vl);
    vuint8m2_t v_scaled_r = __riscv_vmulhu_vv_u8m2(v_bg_r, v_fg_a, vl);

    // b - (b * a / 256)
    vuint8m2_t v_out_b = __riscv_vsub_vv_u8m2(v_bg_b, v_scaled_b, vl);
    vuint8m2_t v_out_g = __riscv_vsub_vv_u8m2(v_bg_g, v_scaled_g, vl);
    vuint8m2_t v_out_r = __riscv_vsub_vv_u8m2(v_bg_r, v_scaled_r, vl);

    // + f, saturating at 255.
    v_out_b = __riscv_vsaddu_vv_u8m2(v_out_b, v_fg_b, vl);
    v_out_g = __riscv_vsaddu_vv_u8m2(v_out_g, v_fg_g, vl);
    v_out_r = __riscv_vsaddu_vv_u8m2(v_out_r, v_fg_r, vl);

    __riscv_vsseg4e8_v_u8m2x4(
        dst_argb,
        __riscv_vcreate_v_u8m2x4(v_out_b, v_out_g, v_out_r, v_opaque), vl);

    remaining -= vl;
    src_argb += 4 * vl;
    src_argb1 += 4 * vl;
    dst_argb += 4 * vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_BLENDPLANEROW_RVV
// Blend a single plane: dst = (a * src0 + (255 - a) * src1 + 255) >> 8.
void BlendPlaneRow_RVV(const uint8_t* src0,
                       const uint8_t* src1,
                       const uint8_t* alpha,
                       uint8_t* dst,
                       int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m4(remaining);
    vuint8m4_t v_fg = __riscv_vle8_v_u8m4(src0, vl);
    vuint8m4_t v_bg = __riscv_vle8_v_u8m4(src1, vl);
    vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl);
    vuint8m4_t v_inv_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl);

    // (a * foreground) + (1-a) * background
    vuint16m8_t v_acc = __riscv_vwmulu_vv_u16m8(v_alpha, v_fg, vl);
    v_acc = __riscv_vwmaccu_vv_u16m8(v_acc, v_inv_alpha, v_bg, vl);
    v_acc = __riscv_vadd_vx_u16m8(v_acc, 255u, vl);  // rounding bias

    __riscv_vse8_v_u8m4(dst, __riscv_vnsrl_wx_u8m4(v_acc, 8, vl), vl);
    remaining -= vl;
    src0 += vl;
    src1 += vl;
    alpha += vl;
    dst += vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
// Attenuate: (f * a + 255) >> 8
#ifdef HAS_ARGBATTENUATEROW_RVV
// Pre-multiply each color channel by the pixel's alpha; alpha passes through.
void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x4_t v_px = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_px, 0);
    vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_px, 1);
    vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_px, 2);
    vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_px, 3);
    // f * a
    vuint16m4_t v_ba = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl);
    vuint16m4_t v_ga = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl);
    vuint16m4_t v_ra = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl);
    // f * a + 255
    v_ba = __riscv_vadd_vx_u16m4(v_ba, 255u, vl);
    v_ga = __riscv_vadd_vx_u16m4(v_ga, 255u, vl);
    v_ra = __riscv_vadd_vx_u16m4(v_ra, 255u, vl);
    // (f * a + 255) >> 8
    v_b = __riscv_vnsrl_wx_u8m2(v_ba, 8, vl);
    v_g = __riscv_vnsrl_wx_u8m2(v_ga, 8, vl);
    v_r = __riscv_vnsrl_wx_u8m2(v_ra, 8, vl);

    __riscv_vsseg4e8_v_u8m2x4(
        dst_argb, __riscv_vcreate_v_u8m2x4(v_b, v_g, v_r, v_a), vl);
    remaining -= vl;
    src_argb += vl * 4;
    dst_argb += vl * 4;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_ARGBEXTRACTALPHAROW_RVV
// Copy only the alpha channel (4th byte of each pixel) to a plane.
void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
                             uint8_t* dst_a,
                             int width) {
  size_t remaining = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(remaining);
    vuint8m2x4_t v_px = __riscv_vlseg4e8_v_u8m2x4(src_argb, vl);
    __riscv_vse8_v_u8m2(dst_a, __riscv_vget_v_u8m2x4_u8m2(v_px, 3), vl);
    remaining -= vl;
    src_argb += vl * 4;
    dst_a += vl;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV
// Scatter a Y plane into the alpha channel of an ARGB row (every 4th byte).
void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
  const ptrdiff_t kPixelStride = 4;  // bytes between consecutive alpha bytes
  size_t remaining = (size_t)width;
  dst += 3;  // advance to the first alpha byte
  do {
    size_t vl = __riscv_vsetvl_e8m8(remaining);
    __riscv_vsse8_v_u8m8(dst, kPixelStride, __riscv_vle8_v_u8m8(src, vl), vl);
    remaining -= vl;
    src += vl;
    dst += vl * kPixelStride;
  } while (remaining > 0);
}
#endif
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
} // namespace libyuv
|
|
#endif
|
|
|
|
#endif // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
|