Add supports for loongarch LSX and LASX.

1. Add supports for LSX and LASX.
2. Three optimization functions are added in loongarch/row_lasx.cc file:
   I422ToARGBRow_LASX,I422ToRGBARow_LASX,I422AlphaToARGBRow_LASX.

Bug: libyuv:912, Bug: libyuv:913
Change-Id: I043c2704f99a5215724b5c0b7f97e6bf5f7a199b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3329189
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Hao Chen 2020-11-06 14:35:20 +08:00 committed by libyuv LUCI CQ
parent 90ffd5cba9
commit 51de1e16f2
12 changed files with 2351 additions and 0 deletions

View File

@ -37,6 +37,10 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
LIBYUV_DISABLE_MSA
LIBYUV_DISABLE_MMI
## LOONGARCH CPUs
LIBYUV_DISABLE_LSX
LIBYUV_DISABLE_LASX
# Test Width/Height/Repeat
The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test a specific resolutions.

View File

@ -51,6 +51,11 @@ static const int kCpuHasMIPS = 0x400000;
static const int kCpuHasMSA = 0x800000;
static const int kCpuHasMMI = 0x1000000;
// These flags are only valid on LOONGARCH processors.
static const int kCpuHasLOONGARCH = 0x2000000;
static const int kCpuHasLSX = 0x4000000;
static const int kCpuHasLASX = 0x8000000;
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
LIBYUV_API
@ -74,6 +79,8 @@ LIBYUV_API
int ArmCpuCaps(const char* cpuinfo_name);
LIBYUV_API
int MipsCpuCaps(const char* cpuinfo_name);
LIBYUV_API
int LoongarchCpuCaps(void);
// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.

File diff suppressed because it is too large Load Diff

View File

@ -684,6 +684,13 @@ extern "C" {
#define HAS_YUY2TOYROW_MMI
#endif
#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
#define HAS_I422TOARGBROW_LASX
#define HAS_I422TORGBAROW_LASX
#define HAS_I422ALPHATOARGBROW_LASX
#endif
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
#if defined(VISUALC_HAS_AVX2)
#define SIMD_ALIGNED(var) __declspec(align(32)) var
@ -954,12 +961,24 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_LASX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_LASX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_MMI(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -973,6 +992,13 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_MSA(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
@ -4221,6 +4247,12 @@ void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_LASX(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToARGBRow_Any_MMI(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@ -4233,6 +4265,12 @@ void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_Any_LASX(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
@ -4240,6 +4278,13 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
const uint8_t* a_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,

View File

@ -121,6 +121,14 @@ int I420ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422ToARGBRow = I422ToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
I422ToARGBRow = I422ToARGBRow_LASX;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@ -360,6 +368,14 @@ int I422ToARGBMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422ToARGBRow = I422ToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
I422ToARGBRow = I422ToARGBRow_LASX;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
@ -1869,6 +1885,14 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 16)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@ -2000,6 +2024,14 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422ALPHATOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 16)) {
I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
}
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
@ -4299,6 +4331,14 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TORGBAROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422ToRGBARow = I422ToRGBARow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
I422ToRGBARow = I422ToRGBARow_LASX;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@ -4504,6 +4544,14 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TORGBAROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422ToRGBARow = I422ToRGBARow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
I422ToRGBARow = I422ToRGBARow_LASX;
}
}
#endif
for (y = 0; y < height; ++y) {
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
@ -5189,6 +5237,14 @@ int I420ToRGB565Dither(const uint8_t* src_y,
}
}
#endif
#if defined(HAS_I422TOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422ToARGBRow = I422ToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
I422ToARGBRow = I422ToARGBRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;

View File

@ -197,6 +197,31 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
return flag;
}
// TODO(fbarchard): Consider read_loongarch_ir().
#define LOONGARCH_CFG2 0x2
#define LOONGARCH_CFG2_LSX (1 << 6)
#define LOONGARCH_CFG2_LASX (1 << 7)
#if defined(__loongarch__) && defined(__linux__)
LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) {
int flag = 0x0;
uint32_t cfg2 = 0;
__asm__ volatile(
"cpucfg %0, %1 \n\t"
: "+&r"(cfg2)
: "r"(LOONGARCH_CFG2)
);
if (cfg2 & LOONGARCH_CFG2_LSX)
flag |= kCpuHasLSX;
if (cfg2 & LOONGARCH_CFG2_LASX)
flag |= kCpuHasLASX;
return flag;
}
#endif
static SAFEBUFFERS int GetCpuFlags(void) {
int cpu_info = 0;
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
@ -240,6 +265,10 @@ static SAFEBUFFERS int GetCpuFlags(void) {
cpu_info = MipsCpuCaps("/proc/cpuinfo");
cpu_info |= kCpuHasMIPS;
#endif
#if defined(__loongarch__) && defined(__linux__)
cpu_info = LoongarchCpuCaps();
cpu_info |= kCpuHasLOONGARCH;
#endif
#if defined(__arm__) || defined(__aarch64__)
// gcc -mfpu=neon defines __ARM_NEON__
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.

View File

@ -119,6 +119,9 @@ ANY41C(I444AlphaToARGBRow_Any_MMI, I444AlphaToARGBRow_MMI, 0, 0, 4, 7)
#ifdef HAS_I422ALPHATOARGBROW_MMI
ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_LASX
ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15)
#endif
#undef ANY41C
// Any 4 planes to 1 plane of 8 bit with yuvconstants
@ -419,6 +422,10 @@ ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7)
ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
#endif
#ifdef HAS_I422TOARGBROW_LASX
ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31)
ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31)
#endif
#undef ANY31C
// Any 3 planes of 16 bit to 1 with yuvconstants

303
source/row_lasx.cc Normal file
View File

@ -0,0 +1,303 @@
/*
* Copyright 2022 The LibYuv Project Authors. All rights reserved.
*
* Copyright (c) 2022 Loongson Technology Corporation Limited
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
#include "libyuv/loongson_intrinsics.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
#define ALPHA_VAL (-1)
// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ubvr, ugvg, yg, yb) \
{ \
__m256i ub, vr, ug, vg; \
\
ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \
vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \
ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \
vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \
yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \
yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
ubvr = __lasx_xvilvl_h(ub, vr); \
ugvg = __lasx_xvilvl_h(ug, vg); \
}
// Load 32 YUV422 pixel data
#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
{ \
__m256i temp0, temp1; \
\
DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
temp1 = __lasx_xvld(psrc_v, 0); \
temp0 = __lasx_xvsub_b(temp0, const_0x80); \
temp1 = __lasx_xvsub_b(temp1, const_0x80); \
temp0 = __lasx_vext2xv_h_b(temp0); \
temp1 = __lasx_vext2xv_h_b(temp1); \
uv_l = __lasx_xvilvl_h(temp0, temp1); \
uv_h = __lasx_xvilvh_h(temp0, temp1); \
}
// Load 16 YUV422 pixel data
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
{ \
__m256i temp0, temp1; \
\
out_y = __lasx_xvld(psrc_y, 0); \
temp0 = __lasx_xvldrepl_d(psrc_u, 0); \
temp1 = __lasx_xvldrepl_d(psrc_v, 0); \
uv = __lasx_xvilvl_b(temp0, temp1); \
uv = __lasx_xvsub_b(uv, const_0x80); \
uv = __lasx_vext2xv_h_b(uv); \
}
// Convert 16 pixels of YUV420 to RGB.
#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, \
yg, yb, b_l, b_h, g_l, g_h, r_l, r_h) \
{ \
__m256i u_l, u_h, v_l, v_h; \
__m256i yl_ev, yl_od, yh_ev, yh_od; \
__m256i temp0, temp1, temp2, temp3; \
\
temp0 = __lasx_xvilvl_b(in_y, in_y); \
temp1 = __lasx_xvilvh_b(in_y, in_y); \
yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg); \
yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg); \
DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
yl_ev, yl_od, yh_ev, yh_od); \
yl_ev = __lasx_xvadd_w(yl_ev, yb); \
yl_od = __lasx_xvadd_w(yl_od, yb); \
yh_ev = __lasx_xvadd_w(yh_ev, yb); \
yh_od = __lasx_xvadd_w(yh_od, yb); \
v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr); \
u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr); \
v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr); \
u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr); \
temp0 = __lasx_xvadd_w(yl_ev, u_l); \
temp1 = __lasx_xvadd_w(yl_od, u_l); \
temp2 = __lasx_xvadd_w(yh_ev, u_h); \
temp3 = __lasx_xvadd_w(yh_od, u_h); \
DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, \
temp0, temp1, temp2, temp3); \
DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, \
temp0, temp1, temp2, temp3); \
b_l = __lasx_xvpackev_h(temp1, temp0); \
b_h = __lasx_xvpackev_h(temp3, temp2); \
temp0 = __lasx_xvadd_w(yl_ev, v_l); \
temp1 = __lasx_xvadd_w(yl_od, v_l); \
temp2 = __lasx_xvadd_w(yh_ev, v_h); \
temp3 = __lasx_xvadd_w(yh_od, v_h); \
DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, \
temp0, temp1, temp2, temp3); \
DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, \
temp0, temp1, temp2, temp3); \
r_l = __lasx_xvpackev_h(temp1, temp0); \
r_h = __lasx_xvpackev_h(temp3, temp2); \
DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
temp0 = __lasx_xvsub_w(yl_ev, u_l); \
temp1 = __lasx_xvsub_w(yl_od, u_l); \
temp2 = __lasx_xvsub_w(yh_ev, u_h); \
temp3 = __lasx_xvsub_w(yh_od, u_h); \
DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, \
temp0, temp1, temp2, temp3); \
DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, \
temp0, temp1, temp2, temp3); \
g_l = __lasx_xvpackev_h(temp1, temp0); \
g_h = __lasx_xvpackev_h(temp3, temp2); \
}
// Convert 8 pixels of YUV420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, \
yg, yb, out_b, out_g, out_r) \
{ \
__m256i u_l, v_l, yl_ev, yl_od; \
__m256i temp0, temp1; \
\
in_y = __lasx_xvpermi_d(in_y, 0xD8); \
temp0 = __lasx_xvilvl_b(in_y, in_y); \
yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od); \
yl_ev = __lasx_xvadd_w(yl_ev, yb); \
yl_od = __lasx_xvadd_w(yl_od, yb); \
v_l = __lasx_xvmulwev_w_h(in_uv, ubvr); \
u_l = __lasx_xvmulwod_w_h(in_uv, ubvr); \
temp0 = __lasx_xvadd_w(yl_ev, u_l); \
temp1 = __lasx_xvadd_w(yl_od, u_l); \
DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
out_b = __lasx_xvpackev_h(temp1, temp0); \
temp0 = __lasx_xvadd_w(yl_ev, v_l); \
temp1 = __lasx_xvadd_w(yl_od, v_l); \
DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
out_r = __lasx_xvpackev_h(temp1, temp0); \
u_l = __lasx_xvdp2_w_h(in_uv, ugvg); \
temp0 = __lasx_xvsub_w(yl_ev, u_l); \
temp1 = __lasx_xvsub_w(yl_od, u_l); \
DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
out_g = __lasx_xvpackev_h(temp1, temp0); \
}
// Pack and Store 16 ARGB values.
#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, \
b_l, b_h, pdst_argb) \
{ \
__m256i temp0, temp1, temp2, temp3; \
\
temp0 = __lasx_xvpackev_b(g_l, b_l); \
temp1 = __lasx_xvpackev_b(a_l, r_l); \
temp2 = __lasx_xvpackev_b(g_h, b_h); \
temp3 = __lasx_xvpackev_b(a_h, r_h); \
r_l = __lasx_xvilvl_h(temp1, temp0); \
r_h = __lasx_xvilvh_h(temp1, temp0); \
g_l = __lasx_xvilvl_h(temp3, temp2); \
g_h = __lasx_xvilvh_h(temp3, temp2); \
temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20); \
temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20); \
temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31); \
temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31); \
__lasx_xvst(temp0, pdst_argb, 0); \
__lasx_xvst(temp1, pdst_argb, 32); \
__lasx_xvst(temp2, pdst_argb, 64); \
__lasx_xvst(temp3, pdst_argb, 96); \
pdst_argb += 128; \
}
// Pack and Store 8 ARGB values.
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
{ \
__m256i temp0, temp1; \
\
temp0 = __lasx_xvpackev_b(in_g, in_b); \
temp1 = __lasx_xvpackev_b(in_a, in_r); \
in_a = __lasx_xvilvl_h(temp1, temp0); \
in_r = __lasx_xvilvh_h(temp1, temp0); \
temp0 = __lasx_xvpermi_q(in_r, in_a, 0x20); \
temp1 = __lasx_xvpermi_q(in_r, in_a, 0x31); \
__lasx_xvst(temp0, pdst_argb, 0); \
__lasx_xvst(temp1, pdst_argb, 32); \
pdst_argb += 64; \
}
void I422ToARGBRow_LASX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
int len = width / 32;
__m256i vec_yb, vec_yg;
__m256i vec_ubvr, vec_ugvg;
__m256i alpha = __lasx_xvldi(0xFF);
__m256i const_0x80 = __lasx_xvldi(0x80);
YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
for (x = 0; x < len; x++) {
__m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
src_y += 32;
src_u += 16;
src_v += 16;
}
}
void I422ToRGBARow_LASX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
int len = width / 32;
__m256i vec_yb, vec_yg;
__m256i vec_ubvr, vec_ugvg;
__m256i alpha = __lasx_xvldi(0xFF);
__m256i const_0x80 = __lasx_xvldi(0x80);
YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
for (x = 0; x < len; x++) {
__m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
src_y += 32;
src_u += 16;
src_v += 16;
}
}
void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
const uint8_t* src_u,
const uint8_t* src_v,
const uint8_t* src_a,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
int x;
int len = width / 32;
int res = width & 31;
__m256i vec_yb, vec_yg;
__m256i vec_ubvr, vec_ugvg;
__m256i zero = __lasx_xvldi(0);
__m256i const_0x80 = __lasx_xvldi(0x80);
YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
for (x = 0; x < len; x++) {
__m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
y = __lasx_xvld(src_a, 0);
a_l = __lasx_xvilvl_b(zero, y);
a_h = __lasx_xvilvh_b(zero, y);
READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
src_y += 32;
src_u += 16;
src_v += 16;
src_a += 32;
}
if (res) {
__m256i y, uv, r, g, b, a;
a = __lasx_xvld(src_a, 0);
a = __lasx_vext2xv_hu_bu(a);
READYUV422(src_y, src_u, src_v, y, uv);
YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
STOREARGB(a, r, g, b, dst_argb);
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)

View File

@ -643,6 +643,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
}
}
#endif
#if defined(HAS_I422TOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
I422ToARGBRow = I422ToARGBRow_Any_LASX;
if (IS_ALIGNED(src_width, 32)) {
I422ToARGBRow = I422ToARGBRow_LASX;
}
}
#endif
void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
ptrdiff_t src_stride, int dst_width,

View File

@ -72,6 +72,15 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
int has_mmi = TestCpuFlag(kCpuHasMMI);
printf("Has MMI %d\n", has_mmi);
#endif
#if defined(__loongarch__)
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
printf("Has LOONGARCH %d\n", has_loongarch);
int has_lsx = TestCpuFlag(kCpuHasLSX);
printf("Has LSX %d\n", has_lsx);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LASX %d\n", has_lasx);
#endif
}
TEST_F(LibYUVBaseTest, TestCompilerMacros) {
@ -151,6 +160,9 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) {
#ifdef _MIPS_ARCH_LOONGSON3A
printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A);
#endif
#ifdef __loongarch__
printf("__loongarch__ %d\n", __loongarch__);
#endif
#ifdef _WIN32
printf("_WIN32 %d\n", _WIN32);
#endif

View File

@ -81,6 +81,16 @@ int TestCpuEnv(int cpu_info) {
cpu_info &= ~libyuv::kCpuHasMMI;
}
#endif
#if defined(__longarch__) && defined(__linux__)
if (TestEnv("LIBYUV_DISABLE_LSX")) {
cpu_info &= ~libyuv::kCpuHasLSX;
}
#endif
#if defined(__longarch__) && defined(__linux__)
if (TestEnv("LIBYUV_DISABLE_LASX")) {
cpu_info &= ~libyuv::kCpuHasLASX;
}
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
(defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86))

View File

@ -23,6 +23,7 @@ int main(int argc, const char* argv[]) {
int has_arm = TestCpuFlag(kCpuHasARM);
int has_mips = TestCpuFlag(kCpuHasMIPS);
int has_x86 = TestCpuFlag(kCpuHasX86);
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
(void)argc;
(void)argv;
@ -65,6 +66,7 @@ int main(int argc, const char* argv[]) {
printf("Has ARM %x\n", has_arm);
printf("Has MIPS %x\n", has_mips);
printf("Has X86 %x\n", has_x86);
printf("Has LOONGARCH %x\n", has_loongarch);
if (has_arm) {
int has_neon = TestCpuFlag(kCpuHasNEON);
printf("Has NEON %x\n", has_neon);
@ -75,6 +77,12 @@ int main(int argc, const char* argv[]) {
int has_mmi = TestCpuFlag(kCpuHasMMI);
printf("Has MMI %x\n", has_mmi);
}
if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX);
printf("Has LSX %x\n", has_lsx);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LASX %x\n", has_lasx);
}
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);