mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
Add MSA optimized remaining scale row functions
R=fbarchard@google.com
BUG=libyuv:634

Performance Gain (vs C vectorized)
ScaleRowDown2_MSA             - ~22.3x
ScaleRowDown2_Any_MSA         - ~19.9x
ScaleRowDown2Linear_MSA       - ~31.2x
ScaleRowDown2Linear_Any_MSA   - ~29.4x
ScaleRowDown2Box_MSA          - ~20.1x
ScaleRowDown2Box_Any_MSA      - ~19.6x
ScaleRowDown4_MSA             - ~11.7x
ScaleRowDown4_Any_MSA         - ~11.2x
ScaleRowDown4Box_MSA          - ~15.1x
ScaleRowDown4Box_Any_MSA      - ~15.1x
ScaleRowDown38_MSA            - ~1x
ScaleRowDown38_Any_MSA        - ~1x
ScaleRowDown38_2_Box_MSA      - ~1.7x
ScaleRowDown38_2_Box_Any_MSA  - ~1.7x
ScaleRowDown38_3_Box_MSA      - ~1.7x
ScaleRowDown38_3_Box_Any_MSA  - ~1.7x
ScaleAddRow_MSA               - ~1.2x
ScaleAddRow_Any_MSA           - ~1.15x

Performance Gain (vs C non-vectorized)
ScaleRowDown2_MSA             - ~22.4x
ScaleRowDown2_Any_MSA         - ~19.8x
ScaleRowDown2Linear_MSA       - ~31.6x
ScaleRowDown2Linear_Any_MSA   - ~29.4x
ScaleRowDown2Box_MSA          - ~20.1x
ScaleRowDown2Box_Any_MSA      - ~19.6x
ScaleRowDown4_MSA             - ~11.7x
ScaleRowDown4_Any_MSA         - ~11.2x
ScaleRowDown4Box_MSA          - ~15.1x
ScaleRowDown4Box_Any_MSA      - ~15.1x
ScaleRowDown38_MSA            - ~3.2x
ScaleRowDown38_Any_MSA        - ~3.2x
ScaleRowDown38_2_Box_MSA      - ~2.4x
ScaleRowDown38_2_Box_Any_MSA  - ~2.3x
ScaleRowDown38_3_Box_MSA      - ~2.9x
ScaleRowDown38_3_Box_Any_MSA  - ~2.8x
ScaleAddRow_MSA               - ~8x
ScaleAddRow_Any_MSA           - ~7.46x

Review-Url: https://codereview.chromium.org/2559683002
This commit is contained in:
parent bd10875846
commit 288bfbefb5
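For context, these kernels are never called directly: libyuv's plane scalers select them at run time, as the scale.cc hunks below show. A minimal caller sketch that would exercise the new 1/2 box path on an MSA build, assuming only libyuv's public ScalePlane API (buffer management omitted, names illustrative):

#include "libyuv/scale.h"

// Halve a grayscale plane. On a MIPS build with MSA enabled, kFilterBox
// at exactly 1/2 size dispatches to ScaleRowDown2Box_MSA, or to
// ScaleRowDown2Box_Any_MSA when dst_width is not a multiple of 32.
void HalvePlane(const uint8* src, int width, int height, uint8* dst) {
  libyuv::ScalePlane(src, width, width, height, dst, width / 2, width / 2,
                     height / 2, libyuv::kFilterBox);
}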
@@ -51,30 +51,28 @@
   })
 #endif  // (__mips == 64)
 
-#define SW(val, pdst)                              \
-  ({                                               \
-    uint8_t* pdst_sw_m = (uint8_t*)(pdst);         \
-    uint32_t val_m = (val);                        \
-    asm volatile("sw %[val_m], %[pdst_sw_m] \n"    \
-                                                   \
-                 : [pdst_sw_m] "=m"(*pdst_sw_m)    \
-                 : [val_m] "r"(val_m));            \
+#define SW(val, pdst)                                    \
+  ({                                                     \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */  \
+    uint32_t val_m = (val);                              \
+    asm volatile("sw %[val_m], %[pdst_sw_m] \n"          \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)          \
+                 : [val_m] "r"(val_m));                  \
   })
 
 #if (__mips == 64)
-#define SD(val, pdst)                              \
-  ({                                               \
-    uint8_t* pdst_sd_m = (uint8_t*)(pdst);         \
-    uint64_t val_m = (val);                        \
-    asm volatile("sd %[val_m], %[pdst_sd_m] \n"    \
-                                                   \
-                 : [pdst_sd_m] "=m"(*pdst_sd_m)    \
-                 : [val_m] "r"(val_m));            \
+#define SD(val, pdst)                                    \
+  ({                                                     \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */  \
+    uint64_t val_m = (val);                              \
+    asm volatile("sd %[val_m], %[pdst_sd_m] \n"          \
+                 : [pdst_sd_m] "=m"(*pdst_sd_m)          \
+                 : [val_m] "r"(val_m));                  \
   })
 #else  // !(__mips == 64)
 #define SD(val, pdst)                                        \
   ({                                                         \
-    uint8_t* pdst_sd_m = (uint8_t*)(pdst);                   \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
     uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
@@ -118,18 +116,18 @@
   })
 #endif  // (__mips == 64)
 
-#define SW(val, pdst)                              \
-  ({                                               \
-    uint8_t* pdst_sw_m = (uint8_t*)(pdst);         \
-    uint32_t val_m = (val);                        \
-    asm volatile("usw %[val_m], %[pdst_sw_m] \n"   \
-                 : [pdst_sw_m] "=m"(*pdst_sw_m)    \
-                 : [val_m] "r"(val_m));            \
+#define SW(val, pdst)                                    \
+  ({                                                     \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */  \
+    uint32_t val_m = (val);                              \
+    asm volatile("usw %[val_m], %[pdst_sw_m] \n"         \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)          \
+                 : [val_m] "r"(val_m));                  \
   })
 
 #define SD(val, pdst)                                        \
   ({                                                         \
-    uint8_t* pdst_sd_m = (uint8_t*)(pdst);                   \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
     uint32_t val0_m, val1_m;                                 \
    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
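Both hunks above are cosmetic: they add /* NOLINT */ markers for the lint bot and drop a stray blank continuation line inside the asm statement, with no change to generated code. For readers unfamiliar with the helpers, SW/SD store a 32/64-bit value through a byte pointer (the usw form tolerates unaligned addresses). A portable C model of that contract, not the actual MIPS implementation:

#include <stdint.h>
#include <string.h>

/* Model of SW: store a word through a byte pointer with no alignment
   assumption (the real macro emits a MIPS sw or usw instruction). */
static inline void SW_model(uint32_t val, uint8_t* pdst) {
  memcpy(pdst, &val, sizeof(val));
}

/* Model of SD: the same contract for a 64-bit doubleword (MIPS sd, or a
   pair of 32-bit stores on 32-bit targets, as in the #else branch). */
static inline void SD_model(uint64_t val, uint8_t* pdst) {
  memcpy(pdst, &val, sizeof(val));
}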
@@ -145,6 +143,9 @@
 #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
 #define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
 
+#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+
 /* Description : Load two vectors with 16 'byte' sized elements
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
@@ -186,6 +187,18 @@
   }
 #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
 
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+  {                                          \
+    ST_H(RTYPE, in0, (pdst));                \
+    ST_H(RTYPE, in1, (pdst) + stride);       \
+  }
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+
 // TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
 /* Description : Shuffle byte vector elements as per mask vector
    Arguments   : Inputs - in0, in1, in2, in3, mask0, mask1
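The new ST_H/ST_UH and ST_H2/ST_UH2 helpers mirror the existing byte-store macros (ST_UB, ST_UB2, ST_UB4) for vectors of 8 halfwords; ScaleAddRow_MSA below uses ST_UH2 to write its two accumulator vectors. A scalar model, with the stride counted in uint16_t elements as in the macro:

#include <stdint.h>
#include <string.h>

/* Model of ST_H2(RTYPE, in0, in1, pdst, stride): two 16-byte stores of
   8 halfword lanes each. */
static void st_h2_model(const uint16_t in0[8], const uint16_t in1[8],
                        uint16_t* pdst, int stride) {
  memcpy(pdst, in0, 8 * sizeof(uint16_t));
  memcpy(pdst + stride, in1, 8 * sizeof(uint16_t));
}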
@@ -106,6 +106,10 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_SCALEARGBROWDOWN2_MSA
 #define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN4_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEADDROW_MSA
 #endif
 
 // Scale ARGB vertically with bilinear interpolation.
@@ -843,6 +847,75 @@ void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
                                 uint8* dst_ptr,
                                 int dst_width);
 
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
@@ -107,6 +107,21 @@ static void ScalePlaneDown2(int src_width,
     ScaleRowDown2 = filtering ? ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
   }
 #endif
+#if defined(HAS_SCALEROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+                                          : ScaleRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_MSA
+                                                      : ScaleRowDown2Box_MSA);
+    }
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -232,6 +247,15 @@ static void ScalePlaneDown4(int src_width,
     ScaleRowDown4 = filtering ? ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
   }
 #endif
+#if defined(HAS_SCALEROWDOWN4_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+    }
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -567,6 +591,26 @@ static void ScalePlaneDown38(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN38_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_MSA;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
+      }
+    }
+  }
+#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -842,6 +886,14 @@ static void ScalePlaneBox(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleAddRow = ScaleAddRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 16)) {
+      ScaleAddRow = ScaleAddRow_MSA;
+    }
+  }
+#endif
 
   for (j = 0; j < dst_height; ++j) {
     int boxheight;
@@ -135,6 +135,21 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
       1,
       15)
 #endif
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+      ScaleRowDown2Linear_MSA,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+      ScaleRowDown2Box_MSA,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+#endif
 #ifdef HAS_SCALEROWDOWN4_SSSE3
 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
 SDANY(ScaleRowDown4Box_Any_SSSE3,
#ifdef HAS_SCALEROWDOWN4_SSSE3
|
||||
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
|
||||
SDANY(ScaleRowDown4Box_Any_SSSE3,
|
||||
@ -162,6 +177,15 @@ SDANY(ScaleRowDown4Box_Any_NEON,
|
||||
1,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN4_MSA
|
||||
SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
|
||||
SDANY(ScaleRowDown4Box_Any_MSA,
|
||||
ScaleRowDown4Box_MSA,
|
||||
ScaleRowDown4Box_C,
|
||||
4,
|
||||
1,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN34_SSSE3
|
||||
SDANY(ScaleRowDown34_Any_SSSE3,
|
||||
ScaleRowDown34_SSSE3,
|
||||
@ -242,6 +266,26 @@ SDANY(ScaleRowDown38_2_Box_Any_NEON,
|
||||
1,
|
||||
11)
|
||||
#endif
|
||||
#ifdef HAS_SCALEROWDOWN38_MSA
|
||||
SDANY(ScaleRowDown38_Any_MSA,
|
||||
ScaleRowDown38_MSA,
|
||||
ScaleRowDown38_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
SDANY(ScaleRowDown38_3_Box_Any_MSA,
|
||||
ScaleRowDown38_3_Box_MSA,
|
||||
ScaleRowDown38_3_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
SDANY(ScaleRowDown38_2_Box_Any_MSA,
|
||||
ScaleRowDown38_2_Box_MSA,
|
||||
ScaleRowDown38_2_Box_C,
|
||||
8 / 3,
|
||||
1,
|
||||
11)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
|
||||
SDANY(ScaleARGBRowDown2_Any_SSE2,
|
||||
@@ -374,6 +418,9 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31)
 #ifdef HAS_SCALEADDROW_NEON
 SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
 #endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
 #undef SAANY
 
 #ifdef __cplusplus
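The SDANY/SAANY wrappers follow libyuv's usual any-width pattern: run the vector kernel over the largest span the mask allows, then let the C row function finish the remaining pixels. A simplified, self-contained model of the idea (not the literal macro expansion; names are illustrative):

#include <stdint.h>

typedef void (*AddRowFn)(const uint8_t* src, uint16_t* dst, int width);

/* C tail, what ScaleAddRow_C does: widen each byte and accumulate. */
static void add_row_c(const uint8_t* src, uint16_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) dst[x] = (uint16_t)(dst[x] + src[x]);
}

/* Any-width wrapper model with MASK = 15: the vector body handles
   multiples of 16, the C fallback handles the tail pixels. */
static void add_row_any(AddRowFn vector_body, const uint8_t* src,
                        uint16_t* dst, int width) {
  int n = width & ~15;
  if (n > 0) vector_body(src, dst, n); /* e.g. ScaleAddRow_MSA */
  add_row_c(src + n, dst + n, width - n);
}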
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
+
 #include "libyuv/scale_row.h"
 
 // This module is for GCC MSA
@@ -169,6 +171,373 @@ void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb,
   }
 }
 
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
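The __msa_pckod_b packs keep the odd byte of each pair, so this matches the C reference, which point-samples the second pixel of every pair; the Linear variant below instead averages the pair with __msa_aver_u_b, i.e. (a + b + 1) >> 1. A scalar sketch of this kernel:

#include <stdint.h>

/* Scalar equivalent of one ScaleRowDown2_MSA row (illustrative). */
static void scale_row_down2_model(const uint8_t* src_ptr, uint8_t* dst,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1]; /* odd byte of each pair, as pckod_b */
  }
}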
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = __msa_aver_u_b(vec1, vec0);
+    dst1 = __msa_aver_u_b(vec3, vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
+    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
+    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
+    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    s += 64;
+    t += 64;
+    dst += 32;
+  }
+}
+
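Here __msa_hadd_u_h widens and sums horizontal byte pairs, the two rows are then added, and __msa_srari_h(.., 2) performs a rounded shift, so each output is a rounded 2x2 box average. Scalar sketch:

#include <stdint.h>

/* Scalar equivalent of ScaleRowDown2Box_MSA: rounded average of each
   2x2 block from rows s (top) and t (bottom). */
static void scale_row_down2_box_model(const uint8_t* s, const uint8_t* t,
                                      uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)(
        (s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}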
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    src_ptr += 64;
+    dst += 16;
+  }
+}
+
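The pckev_b pass keeps bytes 0, 2, 4, ... and the pckod_b pass then keeps the odd ones of those, which leaves byte 2 of every group of 4, the same sample the C reference picks. Scalar sketch:

#include <stdint.h>

/* Scalar equivalent of ScaleRowDown4_MSA (illustrative). */
static void scale_row_down4_model(const uint8_t* src_ptr, uint8_t* dst,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[4 * x + 2]; /* pckev then pckod == byte 2 of each 4 */
  }
}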
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  const uint8_t* t2 = s + src_stride * 3;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
+    vec0 += __msa_hadd_u_h(src0, src0);
+    vec1 += __msa_hadd_u_h(src1, src1);
+    vec2 += __msa_hadd_u_h(src2, src2);
+    vec3 += __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    reg0 = __msa_hadd_u_w(vec0, vec0);
+    reg1 = __msa_hadd_u_w(vec1, vec1);
+    reg2 = __msa_hadd_u_w(vec2, vec2);
+    reg3 = __msa_hadd_u_w(vec3, vec3);
+    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
+    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
+    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
+    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    s += 64;
+    t0 += 64;
+    t1 += 64;
+    t2 += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x, width;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, vec0;
+  v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+
+  assert(dst_width % 3 == 0);
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
+    dst0 = __msa_copy_u_d((v2i64)vec0, 0);
+    dst1 = __msa_copy_u_w((v4i32)vec0, 2);
+    SD(dst0, dst);
+    SW(dst1, dst + 8);
+    src_ptr += 32;
+    dst += 12;
+  }
+}
+
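The shuffle mask spells out the 3/8 sampling pattern: within every 8 source bytes, keep offsets 0, 3 and 6, producing 12 outputs per 32 input bytes (hence the dst_width % 12 check in ScalePlaneDown38). Scalar sketch:

#include <stdint.h>

/* Scalar equivalent of ScaleRowDown38_MSA: 3 outputs per 8 inputs. */
static void scale_row_down38_model(const uint8_t* src_ptr, uint8_t* dst,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[3];
    dst[2] = src_ptr[6];
    dst += 3;
    src_ptr += 8;
  }
}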
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8i16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x2AAA;
+    tmp1 *= const_0x2AAA;
+    tmp4 *= const_0x4000;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t += 32;
+    dst_ptr += 12;
+  }
+}
+
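The constants are fixed-point reciprocals: 0x2AAA is 65536/6 for the 3x2 boxes behind the first two of every three outputs, and 0x4000 is 65536/4 for the trailing 2x2 box; the three-row variant below uses 0x1C71 (65536/9) for its 3x3 boxes and 0x2AAA for its 2-wide, 3-row tail the same way. Scalar sketch of one output triple:

#include <stdint.h>

/* Scalar model of ScaleRowDown38_2_Box: rows s (top) and t (bottom),
   8 source pixels -> 3 destination pixels. */
static void scale_38_2_box_model(const uint8_t* s, const uint8_t* t,
                                 uint8_t* dst) {
  dst[0] = (uint8_t)(
      ((uint32_t)(s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) * 0x2AAA) >> 16);
  dst[1] = (uint8_t)(
      ((uint32_t)(s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) * 0x2AAA) >> 16);
  dst[2] = (uint8_t)(((uint32_t)(s[6] + s[7] + t[6] + t[7]) * 0x4000) >> 16);
}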
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, src4, src5, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8u16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x1C71;
+    tmp1 *= const_0x1C71;
+    tmp4 *= const_0x2AAA;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t0 += 32;
+    t1 += 32;
+    dst_ptr += 12;
+  }
+}
+
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  int x;
+  v16u8 src0;
+  v8u16 vec0, vec1, dst0, dst1;
+  v16i8 zero = {0};
+
+  assert(src_width > 0);
+
+  for (x = 0; x < src_width; x += 16) {
+    src0 = LD_UB(src_ptr);
+    dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
+    dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
+    dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    ST_UH2(dst0, dst1, dst_ptr, 8);
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+}
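The ilvr_b/ilvl_b interleaves with a zero vector zero-extend the 16 source bytes to halfwords before adding them into the 16-bit accumulator row. Scalar sketch:

#include <stdint.h>

/* Scalar equivalent of ScaleRowDown-style ScaleAddRow_MSA: accumulate a
   byte row into a uint16 row (the box scaler sums boxheight rows). */
static void scale_add_row_model(const uint8_t* src_ptr, uint16_t* dst_ptr,
                                int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}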
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv