mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
Add supports for loongarch LSX and LASX.
1. Add supports for LSX and LASX. 2. Three optimization functions are added in loongarch/row_lasx.cc file: I422ToARGBRow_LASX,I422ToRGBARow_LASX,I422AlphaToARGBRow_LASX. Bug: libyuv:912, Bug: libyuv:913 Change-Id: I043c2704f99a5215724b5c0b7f97e6bf5f7a199b Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3329189 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
90ffd5cba9
commit
51de1e16f2
@ -37,6 +37,10 @@ By default the cpu is detected and the most advanced form of SIMD is used. But
|
||||
LIBYUV_DISABLE_MSA
|
||||
LIBYUV_DISABLE_MMI
|
||||
|
||||
## LOONGARCH CPUs
|
||||
LIBYUV_DISABLE_LSX
|
||||
LIBYUV_DISABLE_LASX
|
||||
|
||||
# Test Width/Height/Repeat
|
||||
|
||||
The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test a specific resolutions.
|
||||
|
||||
@ -51,6 +51,11 @@ static const int kCpuHasMIPS = 0x400000;
|
||||
static const int kCpuHasMSA = 0x800000;
|
||||
static const int kCpuHasMMI = 0x1000000;
|
||||
|
||||
// These flags are only valid on LOONGARCH processors.
|
||||
static const int kCpuHasLOONGARCH = 0x2000000;
|
||||
static const int kCpuHasLSX = 0x4000000;
|
||||
static const int kCpuHasLASX = 0x8000000;
|
||||
|
||||
// Optional init function. TestCpuFlag does an auto-init.
|
||||
// Returns cpu_info flags.
|
||||
LIBYUV_API
|
||||
@ -74,6 +79,8 @@ LIBYUV_API
|
||||
int ArmCpuCaps(const char* cpuinfo_name);
|
||||
LIBYUV_API
|
||||
int MipsCpuCaps(const char* cpuinfo_name);
|
||||
LIBYUV_API
|
||||
int LoongarchCpuCaps(void);
|
||||
|
||||
// For testing, allow CPU flags to be disabled.
|
||||
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
|
||||
|
||||
1862
include/libyuv/loongson_intrinsics.h
Normal file
1862
include/libyuv/loongson_intrinsics.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -684,6 +684,13 @@ extern "C" {
|
||||
#define HAS_YUY2TOYROW_MMI
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
|
||||
#define HAS_I422TOARGBROW_LASX
|
||||
#define HAS_I422TORGBAROW_LASX
|
||||
#define HAS_I422ALPHATOARGBROW_LASX
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
|
||||
#if defined(VISUALC_HAS_AVX2)
|
||||
#define SIMD_ALIGNED(var) __declspec(align(32)) var
|
||||
@ -954,12 +961,24 @@ void I422ToARGBRow_MSA(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToARGBRow_LASX(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGBARow_MSA(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGBARow_LASX(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToARGBRow_MMI(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
@ -973,6 +992,13 @@ void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
const uint8_t* src_a,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGB24Row_MSA(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
@ -4221,6 +4247,12 @@ void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToARGBRow_Any_LASX(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToARGBRow_Any_MMI(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
@ -4233,6 +4265,12 @@ void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGBARow_Any_LASX(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
@ -4240,6 +4278,13 @@ void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422AlphaToARGBRow_Any_LASX(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
const uint8_t* a_buf,
|
||||
uint8_t* dst_ptr,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
|
||||
const uint8_t* u_buf,
|
||||
const uint8_t* v_buf,
|
||||
|
||||
@ -121,6 +121,14 @@ int I420ToARGBMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
|
||||
@ -360,6 +368,14 @@ int I422ToARGBMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
|
||||
@ -1869,6 +1885,14 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422ALPHATOARGBROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
|
||||
@ -2000,6 +2024,14 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422ALPHATOARGBROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422AlphaToARGBRow = I422AlphaToARGBRow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I422AlphaToARGBRow = I422AlphaToARGBRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
|
||||
@ -4299,6 +4331,14 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGBAROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422ToRGBARow = I422ToRGBARow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToRGBARow = I422ToRGBARow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
|
||||
@ -4504,6 +4544,14 @@ int I420ToRGBAMatrix(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TORGBAROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422ToRGBARow = I422ToRGBARow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToRGBARow = I422ToRGBARow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width);
|
||||
@ -5189,6 +5237,14 @@ int I420ToRGB565Dither(const uint8_t* src_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2;
|
||||
|
||||
@ -197,6 +197,31 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
|
||||
return flag;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Consider read_loongarch_ir().
|
||||
#define LOONGARCH_CFG2 0x2
|
||||
#define LOONGARCH_CFG2_LSX (1 << 6)
|
||||
#define LOONGARCH_CFG2_LASX (1 << 7)
|
||||
|
||||
#if defined(__loongarch__) && defined(__linux__)
|
||||
LIBYUV_API SAFEBUFFERS int LoongarchCpuCaps(void) {
|
||||
int flag = 0x0;
|
||||
uint32_t cfg2 = 0;
|
||||
|
||||
__asm__ volatile(
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(cfg2)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (cfg2 & LOONGARCH_CFG2_LSX)
|
||||
flag |= kCpuHasLSX;
|
||||
|
||||
if (cfg2 & LOONGARCH_CFG2_LASX)
|
||||
flag |= kCpuHasLASX;
|
||||
return flag;
|
||||
}
|
||||
#endif
|
||||
|
||||
static SAFEBUFFERS int GetCpuFlags(void) {
|
||||
int cpu_info = 0;
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
||||
@ -240,6 +265,10 @@ static SAFEBUFFERS int GetCpuFlags(void) {
|
||||
cpu_info = MipsCpuCaps("/proc/cpuinfo");
|
||||
cpu_info |= kCpuHasMIPS;
|
||||
#endif
|
||||
#if defined(__loongarch__) && defined(__linux__)
|
||||
cpu_info = LoongarchCpuCaps();
|
||||
cpu_info |= kCpuHasLOONGARCH;
|
||||
#endif
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
// gcc -mfpu=neon defines __ARM_NEON__
|
||||
// __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon.
|
||||
|
||||
@ -119,6 +119,9 @@ ANY41C(I444AlphaToARGBRow_Any_MMI, I444AlphaToARGBRow_MMI, 0, 0, 4, 7)
|
||||
#ifdef HAS_I422ALPHATOARGBROW_MMI
|
||||
ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_I422ALPHATOARGBROW_LASX
|
||||
ANY41C(I422AlphaToARGBRow_Any_LASX, I422AlphaToARGBRow_LASX, 1, 0, 4, 15)
|
||||
#endif
|
||||
#undef ANY41C
|
||||
|
||||
// Any 4 planes to 1 plane of 8 bit with yuvconstants
|
||||
@ -419,6 +422,10 @@ ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7)
|
||||
ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7)
|
||||
ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_I422TOARGBROW_LASX
|
||||
ANY31C(I422ToARGBRow_Any_LASX, I422ToARGBRow_LASX, 1, 0, 4, 31)
|
||||
ANY31C(I422ToRGBARow_Any_LASX, I422ToRGBARow_LASX, 1, 0, 4, 31)
|
||||
#endif
|
||||
#undef ANY31C
|
||||
|
||||
// Any 3 planes of 16 bit to 1 with yuvconstants
|
||||
|
||||
303
source/row_lasx.cc
Normal file
303
source/row_lasx.cc
Normal file
@ -0,0 +1,303 @@
|
||||
/*
|
||||
* Copyright 2022 The LibYuv Project Authors. All rights reserved.
|
||||
*
|
||||
* Copyright (c) 2022 Loongson Technology Corporation Limited
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
|
||||
#include "libyuv/loongson_intrinsics.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define ALPHA_VAL (-1)
|
||||
|
||||
// Fill YUV -> RGB conversion constants into vectors
|
||||
#define YUVTORGB_SETUP(yuvconst, ubvr, ugvg, yg, yb) \
|
||||
{ \
|
||||
__m256i ub, vr, ug, vg; \
|
||||
\
|
||||
ub = __lasx_xvreplgr2vr_h(yuvconst->kUVToB[0]); \
|
||||
vr = __lasx_xvreplgr2vr_h(yuvconst->kUVToR[1]); \
|
||||
ug = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[0]); \
|
||||
vg = __lasx_xvreplgr2vr_h(yuvconst->kUVToG[1]); \
|
||||
yg = __lasx_xvreplgr2vr_h(yuvconst->kYToRgb[0]); \
|
||||
yb = __lasx_xvreplgr2vr_w(yuvconst->kYBiasToRgb[0]); \
|
||||
ubvr = __lasx_xvilvl_h(ub, vr); \
|
||||
ugvg = __lasx_xvilvl_h(ug, vg); \
|
||||
}
|
||||
|
||||
// Load 32 YUV422 pixel data
|
||||
#define READYUV422_D(psrc_y, psrc_u, psrc_v, out_y, uv_l, uv_h) \
|
||||
{ \
|
||||
__m256i temp0, temp1; \
|
||||
\
|
||||
DUP2_ARG2(__lasx_xvld, psrc_y, 0, psrc_u, 0, out_y, temp0); \
|
||||
temp1 = __lasx_xvld(psrc_v, 0); \
|
||||
temp0 = __lasx_xvsub_b(temp0, const_0x80); \
|
||||
temp1 = __lasx_xvsub_b(temp1, const_0x80); \
|
||||
temp0 = __lasx_vext2xv_h_b(temp0); \
|
||||
temp1 = __lasx_vext2xv_h_b(temp1); \
|
||||
uv_l = __lasx_xvilvl_h(temp0, temp1); \
|
||||
uv_h = __lasx_xvilvh_h(temp0, temp1); \
|
||||
}
|
||||
|
||||
// Load 16 YUV422 pixel data
|
||||
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, uv) \
|
||||
{ \
|
||||
__m256i temp0, temp1; \
|
||||
\
|
||||
out_y = __lasx_xvld(psrc_y, 0); \
|
||||
temp0 = __lasx_xvldrepl_d(psrc_u, 0); \
|
||||
temp1 = __lasx_xvldrepl_d(psrc_v, 0); \
|
||||
uv = __lasx_xvilvl_b(temp0, temp1); \
|
||||
uv = __lasx_xvsub_b(uv, const_0x80); \
|
||||
uv = __lasx_vext2xv_h_b(uv); \
|
||||
}
|
||||
|
||||
// Convert 16 pixels of YUV420 to RGB.
|
||||
#define YUVTORGB_D(in_y, in_uvl, in_uvh, ubvr, ugvg, \
|
||||
yg, yb, b_l, b_h, g_l, g_h, r_l, r_h) \
|
||||
{ \
|
||||
__m256i u_l, u_h, v_l, v_h; \
|
||||
__m256i yl_ev, yl_od, yh_ev, yh_od; \
|
||||
__m256i temp0, temp1, temp2, temp3; \
|
||||
\
|
||||
temp0 = __lasx_xvilvl_b(in_y, in_y); \
|
||||
temp1 = __lasx_xvilvh_b(in_y, in_y); \
|
||||
yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
|
||||
yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
|
||||
yh_ev = __lasx_xvmulwev_w_hu_h(temp1, yg); \
|
||||
yh_od = __lasx_xvmulwod_w_hu_h(temp1, yg); \
|
||||
DUP4_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yh_ev, 16, yh_od, 16, \
|
||||
yl_ev, yl_od, yh_ev, yh_od); \
|
||||
yl_ev = __lasx_xvadd_w(yl_ev, yb); \
|
||||
yl_od = __lasx_xvadd_w(yl_od, yb); \
|
||||
yh_ev = __lasx_xvadd_w(yh_ev, yb); \
|
||||
yh_od = __lasx_xvadd_w(yh_od, yb); \
|
||||
v_l = __lasx_xvmulwev_w_h(in_uvl, ubvr); \
|
||||
u_l = __lasx_xvmulwod_w_h(in_uvl, ubvr); \
|
||||
v_h = __lasx_xvmulwev_w_h(in_uvh, ubvr); \
|
||||
u_h = __lasx_xvmulwod_w_h(in_uvh, ubvr); \
|
||||
temp0 = __lasx_xvadd_w(yl_ev, u_l); \
|
||||
temp1 = __lasx_xvadd_w(yl_od, u_l); \
|
||||
temp2 = __lasx_xvadd_w(yh_ev, u_h); \
|
||||
temp3 = __lasx_xvadd_w(yh_od, u_h); \
|
||||
DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, \
|
||||
temp0, temp1, temp2, temp3); \
|
||||
DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, \
|
||||
temp0, temp1, temp2, temp3); \
|
||||
b_l = __lasx_xvpackev_h(temp1, temp0); \
|
||||
b_h = __lasx_xvpackev_h(temp3, temp2); \
|
||||
temp0 = __lasx_xvadd_w(yl_ev, v_l); \
|
||||
temp1 = __lasx_xvadd_w(yl_od, v_l); \
|
||||
temp2 = __lasx_xvadd_w(yh_ev, v_h); \
|
||||
temp3 = __lasx_xvadd_w(yh_od, v_h); \
|
||||
DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, \
|
||||
temp0, temp1, temp2, temp3); \
|
||||
DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, \
|
||||
temp0, temp1, temp2, temp3); \
|
||||
r_l = __lasx_xvpackev_h(temp1, temp0); \
|
||||
r_h = __lasx_xvpackev_h(temp3, temp2); \
|
||||
DUP2_ARG2(__lasx_xvdp2_w_h, in_uvl, ugvg, in_uvh, ugvg, u_l, u_h); \
|
||||
temp0 = __lasx_xvsub_w(yl_ev, u_l); \
|
||||
temp1 = __lasx_xvsub_w(yl_od, u_l); \
|
||||
temp2 = __lasx_xvsub_w(yh_ev, u_h); \
|
||||
temp3 = __lasx_xvsub_w(yh_od, u_h); \
|
||||
DUP4_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp2, 6, temp3, 6, \
|
||||
temp0, temp1, temp2, temp3); \
|
||||
DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3, \
|
||||
temp0, temp1, temp2, temp3); \
|
||||
g_l = __lasx_xvpackev_h(temp1, temp0); \
|
||||
g_h = __lasx_xvpackev_h(temp3, temp2); \
|
||||
}
|
||||
|
||||
// Convert 8 pixels of YUV420 to RGB.
|
||||
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, \
|
||||
yg, yb, out_b, out_g, out_r) \
|
||||
{ \
|
||||
__m256i u_l, v_l, yl_ev, yl_od; \
|
||||
__m256i temp0, temp1; \
|
||||
\
|
||||
in_y = __lasx_xvpermi_d(in_y, 0xD8); \
|
||||
temp0 = __lasx_xvilvl_b(in_y, in_y); \
|
||||
yl_ev = __lasx_xvmulwev_w_hu_h(temp0, yg); \
|
||||
yl_od = __lasx_xvmulwod_w_hu_h(temp0, yg); \
|
||||
DUP2_ARG2(__lasx_xvsrai_w, yl_ev, 16, yl_od, 16, yl_ev, yl_od); \
|
||||
yl_ev = __lasx_xvadd_w(yl_ev, yb); \
|
||||
yl_od = __lasx_xvadd_w(yl_od, yb); \
|
||||
v_l = __lasx_xvmulwev_w_h(in_uv, ubvr); \
|
||||
u_l = __lasx_xvmulwod_w_h(in_uv, ubvr); \
|
||||
temp0 = __lasx_xvadd_w(yl_ev, u_l); \
|
||||
temp1 = __lasx_xvadd_w(yl_od, u_l); \
|
||||
DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
|
||||
DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
|
||||
out_b = __lasx_xvpackev_h(temp1, temp0); \
|
||||
temp0 = __lasx_xvadd_w(yl_ev, v_l); \
|
||||
temp1 = __lasx_xvadd_w(yl_od, v_l); \
|
||||
DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
|
||||
DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
|
||||
out_r = __lasx_xvpackev_h(temp1, temp0); \
|
||||
u_l = __lasx_xvdp2_w_h(in_uv, ugvg); \
|
||||
temp0 = __lasx_xvsub_w(yl_ev, u_l); \
|
||||
temp1 = __lasx_xvsub_w(yl_od, u_l); \
|
||||
DUP2_ARG2(__lasx_xvsrai_w, temp0, 6, temp1, 6, temp0, temp1); \
|
||||
DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1); \
|
||||
out_g = __lasx_xvpackev_h(temp1, temp0); \
|
||||
}
|
||||
|
||||
// Pack and Store 16 ARGB values.
|
||||
#define STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, \
|
||||
b_l, b_h, pdst_argb) \
|
||||
{ \
|
||||
__m256i temp0, temp1, temp2, temp3; \
|
||||
\
|
||||
temp0 = __lasx_xvpackev_b(g_l, b_l); \
|
||||
temp1 = __lasx_xvpackev_b(a_l, r_l); \
|
||||
temp2 = __lasx_xvpackev_b(g_h, b_h); \
|
||||
temp3 = __lasx_xvpackev_b(a_h, r_h); \
|
||||
r_l = __lasx_xvilvl_h(temp1, temp0); \
|
||||
r_h = __lasx_xvilvh_h(temp1, temp0); \
|
||||
g_l = __lasx_xvilvl_h(temp3, temp2); \
|
||||
g_h = __lasx_xvilvh_h(temp3, temp2); \
|
||||
temp0 = __lasx_xvpermi_q(r_h, r_l, 0x20); \
|
||||
temp1 = __lasx_xvpermi_q(g_h, g_l, 0x20); \
|
||||
temp2 = __lasx_xvpermi_q(r_h, r_l, 0x31); \
|
||||
temp3 = __lasx_xvpermi_q(g_h, g_l, 0x31); \
|
||||
__lasx_xvst(temp0, pdst_argb, 0); \
|
||||
__lasx_xvst(temp1, pdst_argb, 32); \
|
||||
__lasx_xvst(temp2, pdst_argb, 64); \
|
||||
__lasx_xvst(temp3, pdst_argb, 96); \
|
||||
pdst_argb += 128; \
|
||||
}
|
||||
|
||||
// Pack and Store 8 ARGB values.
|
||||
#define STOREARGB(in_a, in_r, in_g, in_b, pdst_argb) \
|
||||
{ \
|
||||
__m256i temp0, temp1; \
|
||||
\
|
||||
temp0 = __lasx_xvpackev_b(in_g, in_b); \
|
||||
temp1 = __lasx_xvpackev_b(in_a, in_r); \
|
||||
in_a = __lasx_xvilvl_h(temp1, temp0); \
|
||||
in_r = __lasx_xvilvh_h(temp1, temp0); \
|
||||
temp0 = __lasx_xvpermi_q(in_r, in_a, 0x20); \
|
||||
temp1 = __lasx_xvpermi_q(in_r, in_a, 0x31); \
|
||||
__lasx_xvst(temp0, pdst_argb, 0); \
|
||||
__lasx_xvst(temp1, pdst_argb, 32); \
|
||||
pdst_argb += 64; \
|
||||
}
|
||||
|
||||
void I422ToARGBRow_LASX(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
int x;
|
||||
int len = width / 32;
|
||||
__m256i vec_yb, vec_yg;
|
||||
__m256i vec_ubvr, vec_ugvg;
|
||||
__m256i alpha = __lasx_xvldi(0xFF);
|
||||
__m256i const_0x80 = __lasx_xvldi(0x80);
|
||||
|
||||
YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
__m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
|
||||
|
||||
READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
|
||||
YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
|
||||
vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
|
||||
STOREARGB_D(alpha, alpha, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
|
||||
src_y += 32;
|
||||
src_u += 16;
|
||||
src_v += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void I422ToRGBARow_LASX(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
int x;
|
||||
int len = width / 32;
|
||||
__m256i vec_yb, vec_yg;
|
||||
__m256i vec_ubvr, vec_ugvg;
|
||||
__m256i alpha = __lasx_xvldi(0xFF);
|
||||
__m256i const_0x80 = __lasx_xvldi(0x80);
|
||||
|
||||
YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
__m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h;
|
||||
|
||||
READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
|
||||
YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
|
||||
vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
|
||||
STOREARGB_D(r_l, r_h, g_l, g_h, b_l, b_h, alpha, alpha, dst_argb);
|
||||
src_y += 32;
|
||||
src_u += 16;
|
||||
src_v += 16;
|
||||
}
|
||||
}
|
||||
|
||||
void I422AlphaToARGBRow_LASX(const uint8_t* src_y,
|
||||
const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
const uint8_t* src_a,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
int x;
|
||||
int len = width / 32;
|
||||
int res = width & 31;
|
||||
__m256i vec_yb, vec_yg;
|
||||
__m256i vec_ubvr, vec_ugvg;
|
||||
__m256i zero = __lasx_xvldi(0);
|
||||
__m256i const_0x80 = __lasx_xvldi(0x80);
|
||||
|
||||
YUVTORGB_SETUP(yuvconstants, vec_ubvr, vec_ugvg, vec_yg, vec_yb);
|
||||
|
||||
for (x = 0; x < len; x++) {
|
||||
__m256i y, uv_l, uv_h, b_l, b_h, g_l, g_h, r_l, r_h, a_l, a_h;
|
||||
|
||||
y = __lasx_xvld(src_a, 0);
|
||||
a_l = __lasx_xvilvl_b(zero, y);
|
||||
a_h = __lasx_xvilvh_b(zero, y);
|
||||
READYUV422_D(src_y, src_u, src_v, y, uv_l, uv_h);
|
||||
YUVTORGB_D(y, uv_l, uv_h, vec_ubvr, vec_ugvg, vec_yg,
|
||||
vec_yb, b_l, b_h, g_l, g_h, r_l, r_h);
|
||||
STOREARGB_D(a_l, a_h, r_l, r_h, g_l, g_h, b_l, b_h, dst_argb);
|
||||
src_y += 32;
|
||||
src_u += 16;
|
||||
src_v += 16;
|
||||
src_a += 32;
|
||||
}
|
||||
if (res) {
|
||||
__m256i y, uv, r, g, b, a;
|
||||
a = __lasx_xvld(src_a, 0);
|
||||
a = __lasx_vext2xv_hu_bu(a);
|
||||
READYUV422(src_y, src_u, src_v, y, uv);
|
||||
YUVTORGB(y, uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, b, g, r);
|
||||
STOREARGB(a, r, g, b, dst_argb);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_LASX) && defined(__loongarch_asx)
|
||||
@ -643,6 +643,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_LASX;
|
||||
if (IS_ALIGNED(src_width, 32)) {
|
||||
I422ToARGBRow = I422ToARGBRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride, int dst_width,
|
||||
|
||||
@ -72,6 +72,15 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
|
||||
int has_mmi = TestCpuFlag(kCpuHasMMI);
|
||||
printf("Has MMI %d\n", has_mmi);
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch__)
|
||||
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
|
||||
printf("Has LOONGARCH %d\n", has_loongarch);
|
||||
int has_lsx = TestCpuFlag(kCpuHasLSX);
|
||||
printf("Has LSX %d\n", has_lsx);
|
||||
int has_lasx = TestCpuFlag(kCpuHasLASX);
|
||||
printf("Has LASX %d\n", has_lasx);
|
||||
#endif
|
||||
}
|
||||
|
||||
TEST_F(LibYUVBaseTest, TestCompilerMacros) {
|
||||
@ -151,6 +160,9 @@ TEST_F(LibYUVBaseTest, TestCompilerMacros) {
|
||||
#ifdef _MIPS_ARCH_LOONGSON3A
|
||||
printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A);
|
||||
#endif
|
||||
#ifdef __loongarch__
|
||||
printf("__loongarch__ %d\n", __loongarch__);
|
||||
#endif
|
||||
#ifdef _WIN32
|
||||
printf("_WIN32 %d\n", _WIN32);
|
||||
#endif
|
||||
|
||||
@ -81,6 +81,16 @@ int TestCpuEnv(int cpu_info) {
|
||||
cpu_info &= ~libyuv::kCpuHasMMI;
|
||||
}
|
||||
#endif
|
||||
#if defined(__longarch__) && defined(__linux__)
|
||||
if (TestEnv("LIBYUV_DISABLE_LSX")) {
|
||||
cpu_info &= ~libyuv::kCpuHasLSX;
|
||||
}
|
||||
#endif
|
||||
#if defined(__longarch__) && defined(__linux__)
|
||||
if (TestEnv("LIBYUV_DISABLE_LASX")) {
|
||||
cpu_info &= ~libyuv::kCpuHasLASX;
|
||||
}
|
||||
#endif
|
||||
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
|
||||
(defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
|
||||
defined(_M_IX86))
|
||||
|
||||
@ -23,6 +23,7 @@ int main(int argc, const char* argv[]) {
|
||||
int has_arm = TestCpuFlag(kCpuHasARM);
|
||||
int has_mips = TestCpuFlag(kCpuHasMIPS);
|
||||
int has_x86 = TestCpuFlag(kCpuHasX86);
|
||||
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
|
||||
(void)argc;
|
||||
(void)argv;
|
||||
|
||||
@ -65,6 +66,7 @@ int main(int argc, const char* argv[]) {
|
||||
printf("Has ARM %x\n", has_arm);
|
||||
printf("Has MIPS %x\n", has_mips);
|
||||
printf("Has X86 %x\n", has_x86);
|
||||
printf("Has LOONGARCH %x\n", has_loongarch);
|
||||
if (has_arm) {
|
||||
int has_neon = TestCpuFlag(kCpuHasNEON);
|
||||
printf("Has NEON %x\n", has_neon);
|
||||
@ -75,6 +77,12 @@ int main(int argc, const char* argv[]) {
|
||||
int has_mmi = TestCpuFlag(kCpuHasMMI);
|
||||
printf("Has MMI %x\n", has_mmi);
|
||||
}
|
||||
if (has_loongarch) {
|
||||
int has_lsx = TestCpuFlag(kCpuHasLSX);
|
||||
printf("Has LSX %x\n", has_lsx);
|
||||
int has_lasx = TestCpuFlag(kCpuHasLASX);
|
||||
printf("Has LASX %x\n", has_lasx);
|
||||
}
|
||||
if (has_x86) {
|
||||
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
|
||||
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user